Example #1
def step5():
    # Generate summaries from the files on the main path
    # # res = ['d0_sg0', 'd3_sg20', 'd0_sg1', 'd0_sg4', 'd0_sg5', 'd0_sg6', 'd0_sg9', 'd0_sg10', 'd0_sg11', 'd0_sg12', 'd0_sg13', 'd0_sg16', 'd0_sg18', 'd0_sg19', 'd0_sg20', 'd0_sg22', 'd0_sg26', 'd0_sg28', 'd0_sg31', 'd0_sg32', 'd0_sg33', 'd0_sg34', 'd0_sg35', 'd0_sg38', 'd0_sg39', 'd0_sg40', 'd0_sg41', 'd4_sg39', 'd6_sg21', 'd8_sg11', 'd14_sg72', 'd15_sg4', 'd16_sg1', 'd16_sg2', 'd16_sg3', 'd16_sg9', 'd16_sg30', 'd16_sg6', 'd16_sg7', 'd16_sg11', 'd16_sg13', 'd16_sg15', 'd16_sg16', 'd16_sg18', 'd16_sg19', 'd16_sg21', 'd16_sg26', 'd16_sg28', 'd16_sg14', 'd16_sg29', 'd16_sg5', 'd16_sg0', 'd16_sg20', 'd16_sg27']
    for sim_type in ["tf_pdf", "tf_idf", "simple"]:
        for file_number in range(0, 19):
            main_path = {}

            with open("group_data_%s/%s/main_path.json" %
                      (sim_type, file_number), "r") as file:
                main_path = json.load(file)

            ref_path = "group_data_%s/%s/group_%s.json" % (
                sim_type, file_number, file_number)

            # Check whether the target folder exists; create it if not, then store the data inside
            main_path_summary_folder = "main_path_summary/%s/" % sim_type
            main_path_summary_folder = os.path.join(os.path.dirname(__file__),
                                                    main_path_summary_folder)
            # print(main_path_summary_folder)
            if not os.path.exists(main_path_summary_folder):
                # makedirs also creates the parent "main_path_summary" folder when it is missing
                os.makedirs(main_path_summary_folder)

            for i in range(1, 100):
                if main_path[str(i)]:
                    res = main_path[str(i)]
                    summary = system_summary(res, ref_path)
                    # print(summary)
                    log("%s,%s" % (file_number, i), lvl="w")
                    summary_path = "main_path_summary/%s/%s_%s.txt" % (
                        sim_type, file_number, i)
                    with open(summary_path, 'w', encoding='utf8') as file:
                        file.write(summary)
Example #2
def step2():
    # Compare similarity across each group's data and finally produce the group_<group_day>_<sim_type>.json file
    for group_day in range(19):
        DATA = {}
        with open("group_" + str(group_day) + ".json") as file:
            DATA = json.load(file)
        log('group_day: %s' % group_day, lvl='w')
        relation = relation_analysis(DATA, group_day, "tf_pdf")
Example #3
def threshold_test(json_file_path, simple_testing=None):
    start = 0
    stop = 0
    add = 0
    if isinstance(simple_testing, tuple):
        start = simple_testing[0]
        stop = simple_testing[1]
        add = simple_testing[2]

    # Preload the data into memory
    json_file = {}
    with open(json_file_path, 'r') as f:
        json_file = json.load(f)

    # Build a temporary result list
    # columns=['day', 'type', 'accuracy', 'precision','recall','f1-score','threshold']
    TEMP_RESULT = []

    # analysis
    daily_data = json_file['daily_data']
    for daily in daily_data:
        log('process day %s data' % daily['day'], lvl='i')
        TEMP_RESULT_COS = metrics_value(day=daily['day'],
                                        type='cos',
                                        testing_data=daily['cos'],
                                        real_data=daily['real_data'],
                                        start=start,
                                        add=add,
                                        stop=stop)
        TEMP_RESULT.extend(TEMP_RESULT_COS)
        TEMP_RESULT_TF_IDF = metrics_value(day=daily['day'],
                                           type='tf_idf',
                                           testing_data=daily['tf_idf'],
                                           real_data=daily['real_data'],
                                           start=start,
                                           add=add,
                                           stop=stop)
        TEMP_RESULT.extend(TEMP_RESULT_TF_IDF)
        TEMP_RESULT_TF_PDF = metrics_value(day=daily['day'],
                                           type='tf_pdf',
                                           testing_data=daily['tf_pdf'],
                                           real_data=daily['real_data'],
                                           start=start,
                                           add=add,
                                           stop=stop)
        TEMP_RESULT.extend(TEMP_RESULT_TF_PDF)

    log(TEMP_RESULT)

    # Write the results to a DataFrame and export them
    df = pandas.DataFrame(TEMP_RESULT,
                          columns=[
                              'day', 'type', 'accuracy', 'precision', 'recall',
                              'f1-score', 'threshold'
                          ])
    csv_file_path = os.path.join(os.path.dirname(json_file_path),
                                 'analysis.csv')
    df.to_csv(csv_file_path, sep=',', encoding='utf-8')
def words_matrix_tf_pdf_compare(words_matrix, daily_data_len, compare_day_len,
                                channel_list):
    result = []
    words_matrix = tf_pdf(words_matrix, channel_list)
    log(words_matrix)
    # Row 0 of the matrix holds the vocabulary, so skip it when computing similarities
    for i in range(1, daily_data_len + 1):
        for j in range(1 + daily_data_len,
                       compare_day_len + daily_data_len + 1):
            result.append(cosines(words_matrix[i], words_matrix[j]))
    return result
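# The cosines() helper used above is not defined in these examples; the sketch
# below is a minimal assumption that it returns the plain cosine similarity of
# two equal-length numeric vectors (the name and behaviour are assumed, not
# confirmed by the original code):
import numpy as np

def cosines(vec_a, vec_b):
    # Hypothetical helper: cosine similarity between two count/weight vectors.
    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)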
def idf(array):
    log("[idf][start]",lvl="i")
    # Build the idf array
    idf_array = copy.deepcopy(array)
    # Build a temporary copy for the idf calculation
    temp = copy.deepcopy(idf_array)
    # Total number of documents (row 0 is the vocabulary)
    file_count = len(idf_array)-1
    word_count = len(idf_array[0])
    # Transpose first (currently disabled)
    # temp = [[row[i] for row in temp] for i in range(len(temp[0]))]
    # Show the temporary data to confirm the transpose
    log(np.asarray(temp))

    for word in range(word_count):
        word_in_file = 0
        for i in range(1,file_count+1):
            if int(idf_array[i][word]) > 0:
                word_in_file = word_in_file+1

        for j in range(1,file_count+1):
            # normal
            temp[j][word] = abs(np.log10(file_count/(1+word_in_file)))

    log("[idf] \n%s" % np.asarray(temp))
    log("[idf][end]\n", lvl="i")
    return temp
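# A minimal usage sketch of idf(), assuming the matrix layout produced by
# text_to_vector() further down (row 0 is the vocabulary, rows 1..n are
# per-document counts); the sample values are illustrative only:
sample_matrix = [
    ['cat', 'dog', 'fish'],  # vocabulary row
    [2, 0, 1],               # counts for document 1
    [0, 1, 1],               # counts for document 2
]
sample_idf = idf(sample_matrix)
# 'fish' appears in both documents: |log10(2 / (1 + 2))| ~= 0.176
# 'cat' appears in one document:    |log10(2 / (1 + 1))| = 0.0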
def main():
    answer_file = "arrange_day_0/answer.json"
    answer = {}
    with open(answer_file, 'r') as file:
        answer = json.load(file)

    res = []
    # for i in range(1, 100):
    for i in [17]:
        pre = process_file(
            "arrange_day_0/final_group_file_reference/{}_tfidf.json".format(i))
        log(i, lvl='i')
        res.append(metrics_value(answer, pre))
    df = pd.DataFrame(res)
Example #7
 def __init__(self, root_directory):
     # Set up the system logs
     self.logger_object_system = log(
         root_directory,
         os.path.basename(__file__).replace(".py", ""), True)
     self.system_logger = self.logger_object_system.get_system_logger()
     # Set up the application logs
     self.logger_object_application = log(
         root_directory,
         os.path.basename(__file__).replace(".py", ""), False)
     self.application_logger = self.logger_object_application.get_application_logger(
     )
     if len(self.application_logger.handlers) < 2:
         self.logger_object_application.add_stream_handler(
             self.application_logger, Log_level.INFO)
def get_file_list(path,pattern=None):
    log('[get_file_list][start] path: %s, pattern: %s' % (path,pattern), lvl='i')
    file_list = []
    for dir, subdir, files in os.walk(path):
        log('[get_file_list] dir: %s, subdir: %s, files: %s' % (dir,subdir,files))
        for file in files:
            if pattern is not None:
                if re.findall(pattern,os.path.join(dir,file)):
                    file_list.append(os.path.join(dir, file))
            else:
                file_list.append(os.path.join(dir,file))

    log(file_list)
    file_list.sort()
    log('[get_file_list] list: %s' % str(file_list))
    log('[get_file_list][end] path: %s, pattern: %s' % (path,pattern), lvl='i')
    return file_list
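# Hypothetical usage of get_file_list(): collect every .txt file under a data
# folder (the folder name below is only an illustration, not from the source):
txt_files = get_file_list('news_documents', pattern=r'.*\.txt$')
for path in txt_files:
    print(path)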
Example #9
 def __init__(self, root_directory, payload_path, application_logger):
     self.root_directory = root_directory
     self.payload_path = self.root_directory + "/" + payload_path
     self.application_logger = application_logger
     self.logger_object_system = log(
         root_directory,
         os.path.basename(__file__).replace(".py", ""), True)
     self.system_logger = self.logger_object_system.get_system_logger()
def create_document(story_list,story_name):
    log('[create_document][start]', lvl='i')
    log('[create_document] story_list: %s\n' % str(story_list))

    j = 0  # Track how many documents this story contains
    for i in story_list:
        dist = '%s%s%s_%s_%s.txt'%(current_dir,'/news_documents/day',i[2],story_name,j)
        copyfile(i[0],dist)
        j=j+1
        log('[create_document] story name: %s, dist: %s' % (story_name, dist))
    log('[create_document][end]', lvl='i')
def group_file_list(group_list,threshold):
    GROUP_FILE_LISTS = {}
    group_file_path = "/Users/yu_hsuan_chen/PycharmProjects/MEMDS/arrange_day_0/first_clusting_result.json"
    file_reference_path = "/Users/yu_hsuan_chen/PycharmProjects/MEMDS/arrange_day_0/file_reference.json"
    DICT = read_json(group_file_path)
    for i in group_list:
        GROUP_FILE_LISTS[i]=[]
        for j in group_list[i]:
            # print("j: %s" % j)
            res = find_files(j,DICT)
            GROUP_FILE_LISTS[i].extend(res)

    # Files produced by the final clustering
    log("GROUP_FILE_LISTS:\n%s" % GROUP_FILE_LISTS,lvl="i")

    # Save a copy of the data before the conversion
    group_file_result_path = "/Users/yu_hsuan_chen/PycharmProjects/MEMDS/arrange_day_0/final_group_file_reference/"+str(threshold)+"_cos.json"
    with open(group_file_result_path, "w") as file:
        json.dump(GROUP_FILE_LISTS, file)

    # Load the reference files
    file_reference = []
    with open(file_reference_path,"r") as file:
        file_reference = json.load(file)
    log("file_reference:\n%s" % file_reference, lvl="i")


    # Perform the final file conversion
    for i in GROUP_FILE_LISTS:
        for file_list in range(len(GROUP_FILE_LISTS[i])):
            for reference in file_reference:
                # print(reference[1].replace("\\","/"))
                # print(GROUP_FILE_LISTS[i][file_list])
                pattern = ".*\/(.*.txt)"
                pattern = re.compile(pattern)
                m = re.match(pattern,GROUP_FILE_LISTS[i][file_list])
                if m.group(1) in reference[1].replace("\\","/"):
                    GROUP_FILE_LISTS[i][file_list] = reference[0]
    print(GROUP_FILE_LISTS)

    group_file_result_path = "/Users/yu_hsuan_chen/PycharmProjects/MEMDS/arrange_day_0/final_group_file/"+str(threshold)+"_cos.json"
    with open(group_file_result_path,"w") as file:
        json.dump(GROUP_FILE_LISTS,file)
Example #12
 def __init__(self, root_directory, api_dir, endpint_map,
              application_logger):
     self.root_directory = root_directory
     self.api_dir_path = self.root_directory + "/" + api_dir
     self.endpint_map = endpint_map
     self.application_logger = application_logger
     self.logger_object_system = log(
         root_directory,
         os.path.basename(__file__).replace(".py", ""), True)
     self.system_logger = self.logger_object_system.get_system_logger()
Example #13
 def __init__(self, root_directory, api_dir, system_dir,
              system_information_filename, application_logger):
     self.root_directory = root_directory
     self.api_path = self.root_directory + "/" + api_dir
     self.system_metadata_file_path = self.root_directory + "/" + system_dir + "/" + system_information_filename
     self.logger_object_system = log(
         root_directory,
         os.path.basename(__file__).replace(".py", ""), True)
     self.system_logger = self.logger_object_system.get_system_logger()
     self.application_logger = application_logger
     self.check_metadata_file()
def text_to_vector(corpus_list):
    log("[text_to_vector][start]",lvl="i")
    # Build the unique word list
    word_list = []
    for i in range(len(corpus_list)):
        word_list = word_list+corpus_list[i]
    # Convert the words to lower case
    word_list = [word.lower() for word in word_list]
    # Deduplicate and sort the words
    word_list = sorted(list(set(word_list)))

    # Create an empty array for the word counts; row 0 holds the vocabulary
    word_array = []
    word_array.append(word_list)

    # Count vocabulary occurrences for each document
    for raw in range(len(corpus_list)):
        # Lower-case the document once so that matching is case-insensitive
        corpus_list[raw] = [word.lower() for word in corpus_list[raw]]
        temp = []
        # Count how many times each vocabulary word appears in this document
        for word in range(len(word_list)):
            temp.append(corpus_list[raw].count(word_list[word]))
        word_array.append(temp)
    log("[text_to_vector]\n%s" % np.asarray(word_array))
    log("[text_to_vector][end]\n", lvl="i")
    return word_array
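# A small usage sketch of text_to_vector(); the corpus below is illustrative:
sample_corpus = [['a', 'A', 'b'], ['b', 'c']]
sample_matrix = text_to_vector(sample_corpus)
# sample_matrix[0] -> ['a', 'b', 'c']   (lower-cased, deduplicated, sorted vocabulary)
# sample_matrix[1] -> [2, 1, 0]         (counts for the first document)
# sample_matrix[2] -> [0, 1, 1]         (counts for the second document)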
def diff_day(start_day,now):
    log('[diff_day][start]')
    diff = str(datetime.datetime.strptime(now, "%Y%m%d") - datetime.datetime.strptime(start_day, "%Y%m%d"))
    diff = int(re.match(r"\d+", diff).group(0))
    log('[diff_day] start date: %s, end date: %s, diff: %s' % (start_day,now,diff))
    log('[diff_day][end]')
    return diff
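# Usage sketch: diff_day() returns the whole-day difference between two
# YYYYMMDD strings (the dates below are illustrative):
assert diff_day("20140101", "20140105") == 4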
Example #16
def step3():
    """
    # If the files have not been moved yet, this block can move them
    # dir = os.path.dirname(__file__)+"/"
    # for sim_type in ["tf_idf", "tf_pdf", "simple"]:
    #     for i in range(19):
    #         if not os.path.exists("group_data_%s/%s" % (sim_type, i)):
    #             os.mkdir("group_data_%s/%s" % (sim_type, i))
    #         copyfile(dir+"group_data_%s/group_%s.json" % (sim_type, i), dir+"group_data_%s/%s/group_%s.json" % (sim_type, i, i))
    #         move(dir+"group_data_%s/group_%s_%s.json" % (sim_type, i,sim_type), dir+"group_data_%s/%s/group_%s_%s.json" % (sim_type, i, i,sim_type))
    #         pass
    """
    # Read the json files under group_data_%s/%s and convert them to Pajek .net files
    for sim_type in ["tf_pdf", "tf_idf", "simple"]:
        for file_number in range(13, 19):
            relation = {}
            with open('group_data_%s/%s/group_%s_%s.json' %
                      (sim_type, file_number, file_number, sim_type),
                      'r',
                      encoding='utf8') as file:
                relation = json.load(file)
            # print(relation)
            log("load file finish")
            res = {}
            for i in range(1, 100):
                print(sim_type, file_number, i)
                log("get_relation_and_draw start")
                get_relation_and_draw(relation, i / 100, str(i), file_number,
                                      sim_type)
                log("get_relation_and_draw end")
def words_matrix_tf_idf_compare(words_matrix, daily_data_len, compare_day_len):
    """
    Description:
        用來計算各group的tf_idf值

    Args:
        words_matrix: 整個文字矩陣的數量資料
        daily_data_file_info: 主要的檔案,是從output_temp['file_info']來的
        compare_day_file_info: 比對的檔案,compare_day_temp['file_info']來的

    Returns:
        result: 將所有的tf_idf資料形成的一維陣列
    """
    result = []
    words_matrix = tf_idf(words_matrix)
    log(words_matrix)
    # Row 0 of the matrix holds the vocabulary, so skip it when computing similarities
    for i in range(1, daily_data_len + 1):
        for j in range(1 + daily_data_len,
                       compare_day_len + daily_data_len + 1):
            result.append(cosines(words_matrix[i], words_matrix[j]))
    return result
def main():
    task = 1
    paths = [
        "textrank_simple",
        "textrank_simple/reference",
        "textrank_simple/system"
    ]

    for path in paths:
        if not os.path.exists(path):
            os.mkdir(path)

    for file_group in range(0,1):
        source = "group_data/{file_group}/group_{file_group}.json".format(file_group=file_group)
        # print(source)
        dic = ""
        with open(source, "r", encoding="utf8") as file:
            group_data = json.load(file)
        dic = group_data

        content = ""
        for x in dic["source"]:
            content = content + dic["source"][x]

        for i in range(1, 100):
            log("Task: {task}, FileGroup: {file_group}, i: {i}".format(task=task, file_group=file_group, i=i))
            print("summarize")
            ratio = i / 100
            textrank = summarizer.summarize(content, ratio=ratio)

            destination = paths[2] + "/task{task}_group{file_group}th{ratio}.txt".format(task=task,
                                                                                         file_group=file_group, ratio=i)

            with open(destination, "w", encoding="utf8") as file:
                file.write(textrank)

            reference_file(file_group, task, i)
            task = task + 1
            print("reference file done.")
def metrics_value(real_data, system_data):
    check_list = [
        'missile', 'turkish', 'catalan', 'brexit', 'gravitational', 'hk',
        'sewol', 'syria', 'crimea'
    ]
    item_number = 1
    res = []
    for event in check_list:
        real_count = len(real_data[event])
        system_count = len(system_data[event])

        # If the system data has fewer entries, pad it with zeros (and vice versa)
        if real_count > system_count:
            add = [0 for i in range(real_count - system_count)]
            system_data[event].extend(add)
        elif real_count < system_count:
            add = [0 for i in range(system_count - real_count)]
            real_data[event].extend(add)
        else:
            pass

        log(real_data[event])
        log(system_data[event])

        if item_number not in system_data[event]:
            accuracy = 0.0
            precision = 0.0
            recall = 0.0
            f1_score = 0.0
        else:
            log("xxxx")
            accuracy = metrics.accuracy_score(real_data[event],
                                              system_data[event])
            precision = metrics.precision_score(real_data[event],
                                                system_data[event],
                                                average='micro')
            recall = metrics.recall_score(real_data[event],
                                          system_data[event],
                                          average='micro')
            f1_score = metrics.f1_score(real_data[event],
                                        system_data[event],
                                        average='micro')
        log('{},{},{},{},{}'.format(event, accuracy, precision, recall,
                                    f1_score),
            lvl='i')
        res.extend([accuracy, precision, recall, f1_score])
        item_number = item_number + 1
    return res
Example #20
    def load_document(self, docfile, days):
        # Directory containing the document file
        file_dir = os.path.dirname(docfile)

        # Create a sub-folder (currently disabled)
        '''
        output_dir = os.path.splitext(docfile)[0]
        # Create the folder
        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)
        '''

        # Read the combined document file
        with open(docfile, 'r') as docF:
            SG = docF.readlines()

        # Write each sentence group to its own file
        for x in range(0, len(SG)):
            file_name = file_dir + "\\D" + str(days) + "SG" + str(x + 1) + ".txt"
            log("\n[File]: " + file_name)
            with open(file_name, 'w') as SGF:
                res = self.ProcessText(SG[x])
                SGF.write(res)
def process_group_info(daily_data_file_info, compare_day_file_info):
    """
    Description:
        用來計算主要的檔案與被比對的檔案們之間的排列組合

    Args:
        daily_data_file_info: 主要的檔案,是從output_temp['file_info']來的
        compare_day_file_info: 比對的檔案,compare_day_temp['file_info']來的

    Returns:
        process_group: 是個二維陣列,[[第0天的group,第1天的group]]
        daily_data_len: 主要的檔案長度
        compare_day_len: 比對的檔案長度
    """
    log("daily_data_file_info: %s" % daily_data_file_info)
    log("compare_day_file_info: %s" % compare_day_file_info)

    process_group = []
    daily_data_len = len(daily_data_file_info)
    compare_day_len = len(compare_day_file_info)
    for i in range(daily_data_len):
        for j in range(compare_day_len):
            process_group.append([i, j])
    return process_group, daily_data_len, compare_day_len
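# Usage sketch: with two main-day files and three comparison-day files,
# process_group_info() enumerates all 2 x 3 index pairs (file names illustrative):
groups, n_daily, n_compare = process_group_info(['a.txt', 'b.txt'],
                                                ['x.txt', 'y.txt', 'z.txt'])
# groups    -> [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]]
# n_daily   -> 2
# n_compare -> 3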
Example #22
def get_relation_and_draw(relation, threshold, file_name, file_number,
                          sim_type):
    edges = []
    nodes = []
    for i in relation['relation']:
        if i[2] >= threshold:
            edges.append((i[0], i[1]))
            nodes.append(i[0])
            nodes.append(i[1])
    # print(nodes,edges)
    G = nx.DiGraph()
    pos = {}

    pattern = "d(\d+)_sg(\d+)"
    pattern = re.compile(pattern)
    log("add_nodes")
    G.add_nodes_from(nodes)
    for node in nodes:
        m = re.match(pattern, node)
        pos[node] = [int(m.group(1)), int(m.group(2))]
    nx.draw_networkx_nodes(G, pos=pos, nodelist=nodes)
    # print(pos)
    # print(nodes)
    log("add edges")
    G.add_edges_from(edges)
    log("draw net")
    nx.draw_networkx_edges(G, pos)
    nx.draw_networkx_labels(G, pos)
    log("write net")
    nx.write_pajek(
        G,
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "group_data_%s/" % sim_type + str(file_number) + "/" +
            str(file_name) + ".net"))

    fig = plt.gcf()
    # fig.set_size_inches(100, 20)
    # plt.axis('off')
    # log("save jpg")
    # plt.savefig(os.path.join(os.path.dirname(os.path.abspath(__file__)),
    #                          "group_data_%s/" % type + str(file_number) + "/" + str(file_name) + ".png"), dpi=100)
    # plt.show()
    plt.cla()
def format_document(story_list,path):
    log('[format_document][start]', lvl='i')
    # Set the worker's language
    worker = DocToSG('english')
    for i in story_list:
        file = ''
        if isinstance(i, list):
            file = i[0]
        elif isinstance(i, str):
            file = i
        log('[format_document] process: %s' % str(file))
        worker.load_document_common(file,path)
    log('[format_document][end]', lvl='i')
def tf(array):
    log("[tf][start]",lvl="i")
    # Build the tf array
    tf_array = copy.deepcopy(array)

    for raw in range(1, len(tf_array)):
        word_count = sum(tf_array[raw])
        for i in range(len(tf_array[raw])):
            tf_array[raw][i] = tf_array[raw][i] / word_count
    log("[tf] \n%s" % np.asarray(tf_array))
    log("[tf][end]\n",lvl="i")

    return tf_array
def tf_idf(corpus_list):
    log("[tf_idf][start]", lvl="i")
    vector = text_to_vector(corpus_list)
    tf_vect = tf(vector)
    idf_vect = idf(vector)

    tf_idf_vect = copy.deepcopy(vector)

    for i in range(1,len(tf_idf_vect)):
        for j in range(len(tf_idf_vect[i])):
            # log((i,j))
            tf_idf_vect[i][j] = tf_vect[i][j]*idf_vect[i][j]
            # log((tf_vect[i][j],idf_vect[i][j],tf_idf_veict[i][j]))

    log("[tf_idf] \n%s" % np.asarray(tf_idf_vect))
    log("[tf_idf][end]\n", lvl="i")
    return tf_idf_vect
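# Usage sketch of tf_idf(), reusing the test corpus from the main() example
# further down; row 0 of the result is the vocabulary, rows 1..n the weights:
sample_weights = tf_idf([
    ["this", "is", "a", "a", "sample"],
    ["this", "is", "another", "another", "example", "example", "example"],
])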
def run_days(file_list):
    log('[run_days][start]',lvl='i')
    log('[run_days] file_list: %s' % file_list)
    day_list=[]
    tmp = []

    for path in file_list:
        file_name = os.path.basename(path)
        file_name = re.match(r"\d+", file_name).group(0)
        tmp.append(file_name)

    start_day = min(list(set(tmp)))
    end_day = max(list(set(tmp)))
    diff = diff_day(start_day,end_day)

    for path in file_list:
        file_name = os.path.basename(path)
        day = re.match(r"\d+", file_name).group(0)
        diff = diff_day(start_day,day)
        day_list.append([path,day,diff])

    log('[run_days] start_day: %s, end_day: %s, diff: %s,\n day_list: %s' % (start_day, end_day, diff, str(day_list)))
    log('[run_days][end]', lvl='i')
    return start_day, end_day, diff, day_list
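# Usage sketch of run_days(); file basenames are assumed to start with a
# YYYYMMDD date (the paths below are illustrative, not from the source):
sample_files = ['data/20140301_cnn_missile.txt', 'data/20140304_bbc_missile.txt']
start_day, end_day, diff, day_list = run_days(sample_files)
# start_day -> '20140301', end_day -> '20140304', diff -> 3
# day_list  -> [[path, day, offset_from_start_day], ...] for each file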
def main():
    # Plain-array test data
    corpus_simple = [
        ['a', 'a', 'b'],
        ['b', 'a', 'a']
    ]
    res = simple(corpus_simple)
    print(res)

    for i in range(1,len(res)):
        for j in range(1,i):
            log("%s, %s" % (i,j))
            print(cosines(res[i],res[j]))

    # tf-idf test data
    corpus_tf_idf = [
        ["this", "is", "a", "a","sample"],
        ["this","is","another","another","example","example","example"],
    ]

    # tf-pdf test data
    corpus_tf_pdf = [
        ["cat","pet"],
        ["fish","pet"],
        ["cat","eat","fish"],
        ["fish","die"]
    ]
    group = [1,1,2,2]

    tf_idf(corpus_tf_idf)
    log('='*40,lvl='i')
    res = tf_pdf(corpus_tf_pdf,group)

    for i in range(1,len(res)):
        for j in range(1,i):
            log("%s, %s" % (i,j))
            cosines(res[i],res[j])
Example #28
    def ProcessText(self, content):
        content = content.replace("\"", " ").replace(".", " ")
        wordList = content.lower().split(" ")
        _wordList = []
        for word in wordList:
            word = word.strip("\n")
            word = word.replace("\n", " ")
            word = word.strip()
            if any(ch in string.punctuation for ch in word):
                log("!!!!!!!!!!!!!!!" + word)
            if "\n" in word:
                log("!!!!!!!!!!!!!!!" + word)

            if word in self.stopWords:
                continue

            _word = self.RemovePunctuation(word)
            if _word is None or _word == '':
                continue

            # Filter out words shorter than the minimum length
            if len(_word) < 4:
                continue

            _word2 = self.RemovePrime(_word)

            _word = self.ChangeForm(_word2)
            if _word is None:
                continue

            # Filter out designated special words
            if _word not in self.specialWords:
                res = self.RemovePunctuation(_word)
                _wordList.append(res)
        log(_wordList)
        return ' '.join(_wordList)
def generate_time_data(input,output,day,sources,news_events):
    log('[generate_time_data][start]', lvl='i')
    log('[generate_time_data] input: %s, output: %s, day: %s' % (input,output,day), lvl='i')

    # Find all files and merge them into one list
    log('[generate_time_data] find all files and merge to one list')

    all_list = []
    for source in sources:
        for news_event in news_events:
            pattern = '.*' + source + '.*' + news_event + '.*' + r'\d+'
            file_list = get_file_list(input,pattern = pattern)
            all_list.extend(file_list)
    log('[generate_time_data] all_list:\n%s' % all_list)

    # Collect the files belonging to each event
    event_file_list = []
    for news_event in news_events:
        event_dic = {}
        event_dic['event_name'] = news_event

        # Record every file path belonging to this event
        temp_file_path = []

        for file_name in all_list:
            if news_event in file_name:
                temp_file_path.append(file_name)

        # Record the date of each path
        temp_file_day = []
        for path in temp_file_path:
            file_name = os.path.basename(path)
            file_name = re.match(r"\d+", file_name).group(0)
            temp_file_day.append(file_name)

        # Start date
        start_day = min(list(set(temp_file_day)))
        # End date
        end_day = max(list(set(temp_file_day)))
        # Total duration in days
        diff = diff_day(start_day, end_day)

        event_dic['start_day'] = start_day
        event_dic['end_day'] = end_day
        event_dic['diff_day'] = diff

        # Record path, date, day offset, source, and event name
        temp_result = []
        for i in range(len(temp_file_path)):
            # Path
            file_path = temp_file_path[i]
            # Date
            file_day = temp_file_day[i]
            # Day offset from the start date
            file_diff = diff_day(start_day,file_day)
            # Source
            file_source = ''
            for source in sources:
                if source in temp_file_path[i]:
                    file_source = source
            # Event name
            file_event = news_event

            temp_result.append([file_path,file_day,file_diff,file_source,file_event])

        # Recompute the arranged day offsets
        res = arrange_day(temp_result,day)
        event_dic['file_info'] = res

        # Append the event record
        event_file_list.append(event_dic)


    output_path = os.path.join(output,'arrange_day_%s.json' % day)
    log('[generate_time_data] output result to %s' % output_path, lvl='i')
    with open(output_path,'w') as fp:
        json.dump(event_file_list,fp)

    log('[generate_time_data] file info:\n %s' % event_file_list)
    log('[generate_time_data][end]', lvl='i')
    return output_path
def path_is_exists(path):
    log('[path_is_exists][start] Path: %s' % path,lvl='i')
    if not os.path.exists(path):
        log('[path_is_exists] Create the folder "%s"' % path)
        os.makedirs(path)
    log('[path_is_exists][end] Path: "%s"' % path, lvl='i')