def __init__(self, input_url): self.input_url = input_url print "INIT INPUT URL:", self.input_url self.base_url = c_m.mk_url_base(self.input_url) # home_page print "INIT BASE URL:", self.base_url self.domain = c_m.strip_to_domain(self.base_url) print "INIT DOMAIN:", self.domain # base paths: self.static_part_path = "site_data\\" self.pc_specific_part_path = os.path.dirname( os.path.realpath(__file__)) + "\\" self.main_path = self.pc_specific_part_path + self.static_part_path print "INIT MAIN PATH:", self.main_path # dynamic part path: self.domain_folder_name = self.domain + u"\\" self.domain_folder_path = self.main_path + self.domain_folder_name self.page_list_f_name = u"page_list.txt" #site pages' data and data file names destination self.page_list_path = self.main_path + self.domain_folder_name + self.page_list_f_name # module specific paths self.html_folder_name = u"html\\" # retrieved html ouput files folder self.text_folder = u"text\\" # output unique text files folder self.eng_text_folder = u"text_eng\\" # translated text output files folder self.text_folder_path = self.domain_folder_path + self.text_folder self.text_eng_folder_path = self.domain_folder_path + self.eng_text_folder
def laod_tabs(self, adjusted_page_list, eng_text): """ From or according to adjusted page list data sets needed variable values, retrieves original or english page texts, loads tabs with apropriate data in place""" print "IN: laod_tabs" for ind in range(len(adjusted_page_list)): #laod_tabs: setting variables base_url = adjusted_page_list[0][1] url_domain = c_m.strip_to_domain(base_url) domain_folder = url_domain + "\\" page_link_url = adjusted_page_list[ind][1] active_link_url_no_domain = page_link_url.replace(base_url,'') active_link_url_no_domain = self.fit_links_to_50(active_link_url_no_domain) print "laod_tabs: TEST ---------------- active_link_url_no_domain %r" % active_link_url_no_domain self.active_link_url_no_domain_list.append(active_link_url_no_domain) active_tab = self.t_tab_list[ind] #laod_tabs: method supports original and english page text loading set as input for method if eng_text: folder_type = "text_eng\\" page_text_f_name = adjusted_page_list[ind][4] else: folder_type = "text\\" page_text_f_name = adjusted_page_list[ind][3] page_text_f_path = self.main_path + domain_folder + folder_type + page_text_f_name #laod_tabs: LOADING TEXT FROM FILE self.noteb.add(active_tab, text='%s' % (active_link_url_no_domain)) #laod_tabs: tab header text #laod_tabs: INSERTING HYPERLINK TO TAB TEXT WIDGET hyper_page_link = page_link_url + '\n\n' #laod_tabs: empty line between hyperlink and text active_tab.insert(END, hyper_page_link) #laod_tabs: RETRIEVING PAGE TEXT FROM FILE page_text = c_m.simply_read(page_text_f_path) #laod_tabs: INSERTING PAGE TEXT TO TAB TEXT WIDGET active_tab.insert(END, page_text) #laod_tabs: Adding retrieved text to notebook text window #laod_tabs: CONFIGURING INSERTED LINK TO SHOW UP AS PROPER HYPERLINK active_tab.tag_add('hyper', '1.0', '1.%d' % len(page_link_url)) active_tab.tag_config('hyper', foreground='blue') active_tab.tag_bind('hyper', "<Enter>", lambda event, arg=ind: self.mouse_on(event, arg)) active_tab.tag_bind('hyper', "<Leave>", lambda event, arg=ind: self.mouse_of(event, arg)) active_tab.tag_bind('hyper', "<Button-1>", lambda event, arg=page_link_url: self.hyper(event, arg)) #TEST FUNCTIONALITY: if keyword present make the tab visible in GUI page_key_word = ["Om-oss", "om-oss", "om oss", "OM-OSS"] if (page_key_word[0] or page_key_word[1] or page_key_word[2] or page_key_word[3]) in page_link_url: self.noteb.select(active_tab)
def mk_tabs_data_set(self, site_data_entry): """According to site data from 1_ready_for_veri file retrieves page data from page_list file, places homepage as first tab, sorts other according to page text symbol count from biggest down""" print "mk_tabs_data_set: RETRIEVING AND REARANGING PAGE LIST ENTRY DATA" base_url = site_data_entry[3] page_list_f_name = 'page_list.txt' url_domain = c_m.strip_to_domain(base_url) # domain acts as key in filesystem composed of folders named as domains print "FOR DOMAIN: ", url_domain domain_folder = url_domain + "\\" page_list_path = self.main_path + domain_folder + page_list_f_name page_list = c_m.l_of_l_read(page_list_path) print "mk_tabs_data_set: ADJUSTING PAGE LIST" adjusted_page_list = self.mk_adjusted_page_list(page_list) # sorting list according to unique page text length return adjusted_page_list