def main(head=5, tail=5):
    """Run knock1 through knock10 on the lines returned by ``read_file()``.

    Each step prints its result. Knock3 and knock4 also write ``cat1.txt``,
    ``cat2.txt`` and ``re_address.txt`` (paths built with ``make_path``)
    through ``file_tool``.

    Args:
        head: Number of leading rows printed by knock5, knock8 and knock9.
        tail: Number of trailing rows printed by knock6. ``0`` prints nothing.

    Raises:
        Exception: If the lines rebuilt in knock4 differ from the input lines.
        IndexError: If a line has fewer than two space-separated columns
            after tabs are replaced.
    """
    # knock1: count lines of file
    lines = read_file()
    print("knock1: line count is {0}".format(len(lines)))

    # knock2: replace tab with space
    replaced = [ln.replace("\t", " ") for ln in lines]

    # knock3: split columns and save the first two to separate files
    splitted = [ln.split(" ") for ln in replaced]
    col1 = [cols[0] for cols in splitted]
    col2 = [cols[1] for cols in splitted]
    file_tool.write_file(make_path("cat1.txt"), col1)
    file_tool.write_file(make_path("cat2.txt"), col2)

    # knock4: reconstruct the file from the column files written in knock3
    col1_re = file_tool.read_file(make_path("cat1.txt"))
    col2_re = file_tool.read_file(make_path("cat2.txt"))

    def trim_cr(line_with_cr):
        """Return the text with every newline character removed."""
        return line_with_cr.replace("\n", "")

    restructured = [
        "\t".join(trim_cr(column) for column in columns)
        for columns in zip(col1_re, col2_re)
    ]
    file_tool.write_file(make_path("re_address.txt"), restructured)

    # The rebuilt lines must match the input both in count and in content.
    knock4_validate = len(lines) == len(restructured) and all(
        original == rebuilt for original, rebuilt in zip(lines, restructured)
    )
    if not knock4_validate:
        raise Exception("knock4 is failed! check knock1 to 4.")
    print("knock 2 to 4 seems good!")

    # knock5: show head x lines
    print("knock5. show head {0} lines.".format(head))
    for line in replaced[:head]:
        print(line)

    # knock6: show tail x lines
    print("knock6. show tail {0} lines.".format(tail))
    # replaced[-0:] is the whole list, so a tail of 0 must be handled
    # explicitly to print nothing.
    for line in (replaced[-tail:] if tail else []):
        print(line)

    # knock7: count kinds of column1
    col1_kinds = Counter(col1)
    print("knock7. show counted result of top 10.")
    print(col1_kinds.most_common(10))  # show top 10

    # knock8: sort by column2
    sorted_by_col2 = sorted(splitted, key=lambda cols: cols[1])
    print("knock8. show sorted result of top {0}.".format(head))
    for columns in sorted_by_col2[:head]:
        print(" ".join(columns))

    # knock9: sort by column2, then column1
    # A tuple key keeps the column boundary; concatenating the two strings
    # would compare characters across columns and can misorder rows.
    sorted_by_col2_col1 = sorted(splitted, key=lambda cols: (cols[1], cols[0]))
    print("knock9. show sorted result of top {0}.".format(head))
    for columns in sorted_by_col2_col1[:head]:
        print(" ".join(columns))

    # knock10: count kinds of column2, using the file written in knock3
    col2_kinds = Counter(trim_cr(x) for x in col2_re)
    print("knock10. show counted result of top 10.")
    print(col2_kinds.most_common(10))  # show top 10
def write_file(filename, rows, separator="\t"):
    """Write ``rows`` to ``filename`` under ``DATASET_HOME`` via ``tool.write_file``.

    Args:
        filename: Name appended directly to ``DATASET_HOME`` to form the path.
        rows: Data handed unchanged to ``tool.write_file``.
        separator: Separator handed unchanged to ``tool.write_file``.
    """
    # The target path is plain string concatenation, exactly as before.
    tool.write_file(DATASET_HOME + filename, rows, separator)
def main(head=5, tail=5):
    """Run knock1 through knock10 on the lines returned by ``read_file()``.

    Each step prints its result. Knock3 and knock4 also write ``cat1.txt``,
    ``cat2.txt`` and ``re_address.txt`` (paths built with ``make_path``)
    through ``file_tool``.

    Args:
        head: Number of leading rows printed by knock5, knock8 and knock9.
        tail: Number of trailing rows printed by knock6. ``0`` prints nothing.

    Raises:
        Exception: If the lines rebuilt in knock4 differ from the input lines.
        IndexError: If a line has fewer than two space-separated columns
            after tabs are replaced.
    """
    # knock1: count lines of file
    lines = read_file()
    print("knock1: line count is {0}".format(len(lines)))

    # knock2: replace tab with space
    replaced = [ln.replace("\t", " ") for ln in lines]

    # knock3: split columns and save the first two to separate files
    splitted = [ln.split(" ") for ln in replaced]
    col1 = [cols[0] for cols in splitted]
    col2 = [cols[1] for cols in splitted]
    file_tool.write_file(make_path("cat1.txt"), col1)
    file_tool.write_file(make_path("cat2.txt"), col2)

    # knock4: reconstruct the file from the column files written in knock3
    col1_re = file_tool.read_file(make_path("cat1.txt"))
    col2_re = file_tool.read_file(make_path("cat2.txt"))

    def trim_cr(line_with_cr):
        """Return the text with every newline character removed."""
        return line_with_cr.replace("\n", "")

    restructured = [
        "\t".join(trim_cr(column) for column in columns)
        for columns in zip(col1_re, col2_re)
    ]
    file_tool.write_file(make_path("re_address.txt"), restructured)

    # The rebuilt lines must match the input both in count and in content.
    knock4_validate = len(lines) == len(restructured) and all(
        original == rebuilt for original, rebuilt in zip(lines, restructured)
    )
    if not knock4_validate:
        raise Exception("knock4 is failed! check knock1 to 4.")
    print("knock 2 to 4 seems good!")

    # knock5: show head x lines
    print("knock5. show head {0} lines.".format(head))
    for line in replaced[:head]:
        print(line)

    # knock6: show tail x lines
    print("knock6. show tail {0} lines.".format(tail))
    # replaced[-0:] is the whole list, so a tail of 0 must be handled
    # explicitly to print nothing.
    for line in (replaced[-tail:] if tail else []):
        print(line)

    # knock7: count kinds of column1
    col1_kinds = Counter(col1)
    print("knock7. show counted result of top 10.")
    print(col1_kinds.most_common(10))  # show top 10

    # knock8: sort by column2
    sorted_by_col2 = sorted(splitted, key=lambda cols: cols[1])
    print("knock8. show sorted result of top {0}.".format(head))
    for columns in sorted_by_col2[:head]:
        print(" ".join(columns))

    # knock9: sort by column2, then column1
    # A tuple key keeps the column boundary; concatenating the two strings
    # would compare characters across columns and can misorder rows.
    sorted_by_col2_col1 = sorted(splitted, key=lambda cols: (cols[1], cols[0]))
    print("knock9. show sorted result of top {0}.".format(head))
    for columns in sorted_by_col2_col1[:head]:
        print(" ".join(columns))

    # knock10: count kinds of column2, using the file written in knock3
    col2_kinds = Counter(trim_cr(x) for x in col2_re)
    print("knock10. show counted result of top 10.")
    print(col2_kinds.most_common(10))  # show top 10