def main():
    """Fetch a fixed interfax.ru article, save it filtered, and report its long lines.

    The page is downloaded with ``urlOpen.get_html``, passed through
    ``tags_filter`` and written to ``parsed3.html`` as UTF-8. The filtered
    text is then handed to ``get_list_of_lines``, which yields a maximum, a
    minimum and a list of per-line entries. Every entry whose first field is
    at least 40% of the maximum and whose third field does not start with
    ``"Copyright"`` is printed together with its index, followed by a short
    summary.

    Nothing is written or printed when the download yields no content.
    """
    import urlOpen

    text = urlOpen.get_html("http://www.interfax.ru/world/502926")
    if not text:
        # A failed/empty fetch has nothing to filter or save; bail out instead
        # of passing a falsy value into tags_filter and file.write.
        return

    text = tags_filter(text)
    with open("parsed3.html", mode='w', encoding='utf-8') as file:
        file.write(text)

    max_char, min_char, line_list = get_list_of_lines(text)

    # The cut-off depends only on max_char, so compute it once up front.
    level = max_char * 0.4
    for index, entry in enumerate(line_list):
        # entry[0] is compared against the length threshold and entry[2] is
        # the text checked for the "Copyright" prefix.
        # NOTE(review): presumably entry is (char_count, <something>, line_text)
        # -- confirm against get_list_of_lines.
        if entry[0] >= level and not entry[2].startswith("Copyright"):
            print(index, entry[0], entry[1], entry[2])

    print("Max chars in line: {}\n"
          "Min chars in line {}\n"
          "Num of lines {}".format(max_char, min_char, len(line_list)))
if __name__ == "__main__":
    # main() is intentionally not invoked here; running the file as a script
    # performs the ria.ru extraction below instead. (A stray module-level copy
    # of main()'s body used to precede this guard; it would have run on import
    # and referenced urlOpen before it was imported, so it has been dropped --
    # call main() to get that behaviour.)
    import urlOpen

    html_code = urlOpen.get_html("http://ria.ru/world/20160406/1403678547.html")
    # Only produce output files when the fetch returned something.
    if html_code:
        # parsed2.html: the page after tags_filter_head_and_script.
        with open("parsed2.html", mode='w', encoding='utf-8') as file:
            file.write(tags_filter_head_and_script(html_code))

        # parsed3.html: the result of get_text_from_html on the same page.
        # NOTE(review): assumed to belong inside the `if html_code:` guard,
        # since it also needs a non-empty page -- confirm.
        text = get_text_from_html(html_code)
        with open("parsed3.html", mode='w', encoding='utf-8') as file:
            file.write(text)