def mk_eng_txt_files(self, **rewrite):
        """ Takes a list of texts parts feeds to translate,
        gets the translated text,
        stacks it together

        Keyword Args:
            rewrite: expects key "rewrite" (bool); True forces re-translation
                of already translated pages, False translates only once.
                NOTE(review): the normal-length branch indexes
                rewrite["rewrite"] directly, so omitting the key raises
                KeyError there - confirm all callers pass rewrite=...
        """

        print "mk_eng_txt_files: RETRIEVING PAGE_LIST.........."
        page_list = c_m.l_of_l_read(self.page_list_path)

        if len(page_list) < 1:  # handling empty page_list case
            print "mk_eng_txt_files: PAGE LIST IS NOT POPULATED, RUN HTML_File_Maker AND Text_Extractor MODULES FIRST"
        else:  # handling page_list for partially transalted sites
            # NOTE(review): self.new_page_list is only initialised on this
            # branch; with an empty page_list the final l_of_l_write below
            # would hit an unset attribute (unless set elsewhere) - verify.
            print "mk_eng_txt_files: IN CASE PAGE LIST ALREADY HAD SOME ENG_TEXT ENTRIES SETTING INITIAL new_page_list TO LAST KNOWN PAGE_LIST VERSION"
            self.new_page_list = copy(page_list)

        # iterating throug unique text per page txts
        for data_set in page_list:
            # data_set[2] is the per-page text file name produced by the
            # Text_Extractor module (see mk_text_files)
            self.p_text_f_name = data_set[2]
            print "mk_eng_txt_files: TRANSLATING TEXT FROM FILE %s" % self.p_text_f_name

            self.eng_p_text_f_name = "eng_" + self.p_text_f_name
            self.eng_p_text_f_path = self.text_eng_folder_path + self.eng_p_text_f_name

            self.page_text = c_m.simply_read(self.text_folder_path,
                                             self.p_text_f_name)

            # if page has less than 10 symbols it is not translated
            if len(self.page_text) < 10:
                print "mk_eng_txt_files: NOT WORTH TRANSLATING, WRITING AS IS AND SKIPPING..."
                c_m.simply_write(self.page_text, self.eng_p_text_f_path)

            elif len(self.page_text) > self.max_page_length:
                print "mk_eng_txt_files: PAGE TEXT IS TOO LONG DEVIDING TO PARTS, TRANSLATING AND GETTING BACK FULL PAGE TEXT"
                text_output = self.get_text_parts(**rewrite)

            else:  # 10 < len(page_text) < 2000

                if rewrite["rewrite"]:
                    print "mk_eng_txt_files: TRANSLATING IN REWRITE MODE"
                    text_output = self.get_text()

                elif not os.path.exists(self.eng_p_text_f_path):
                    print "mk_eng_txt_files: TRANSLATING IN ONLY ONCE MODE"
                    text_output = self.get_text()

                else:
                    print "mk_eng_txt_files: SKIPPING FILE, ALREADY TRANSLATED"
                    # continue

            # NOTE(review): the write of text_output is commented out below,
            # so translated text appears to never reach disk here - presumably
            # get_text()/get_text_parts() write it themselves; confirm.
            # print "WRITING TRANSLATED OUTPUT TO FILE: ", self.eng_p_text_f_name
            # c_m.simply_write(text_output, self.text_eng_folder_path, self.eng_p_text_f_name)
                # NOTE(review): these two appends are indented under the final
                # else branch, so pages taking the <10-symbols or too-long
                # branches never get their eng file name recorded - verify
                # this is intentional.
                data_set.append(self.eng_p_text_f_name
                                )  # updating dataset with eng_text file name
                self.new_page_list.append(
                    data_set)  # updating page list with updated entry

        print "mk_eng_txt_files: DONE TRANSLATING SITE %s " % self.domain
        print "mk_eng_txt_files: UPDATING PAGE LIST WITH ENG TEXT FILE NAMES"
        c_m.l_of_l_write(self.new_page_list, self.page_list_path)
        print "mk_eng_txt_files: SITE TRANSLATION FINISHED, CLOSING CHROME WEBDIRVER"
        # closes the selenium chrome driver instance used for translation
        self.loaded_driver.quit()
示例#2
0
    def mk_htmls(self):
        """ Makes unicode local html copies
        creates page_list file with file_name and respective link url as data entries"""

        domain_link_list = self.scrape_domain_int_links()
        # driving links, to collect htmls from web and create htmls with removed img elements
        for ind in range(len(domain_link_list)):
            page_file_name = "page_" + str(
                ind +
                1) + ".html"  # gen file names for html file of page on site
            active_page_link = domain_link_list[ind]
            if ";;;" in active_page_link:  # separator problems hedging
                self.links_contain_seprator.append(
                    [active_page_link, page_file_name])
                print "LINK CONTAINS SEPARATOR, CAN NOT ADD TO PAGE LIST, WILL RUIN DATASYSTEM, WILL BE ADDED TO SEPARATE FILE"
            else:
                BS_object = self.retrieve_html(active_page_link,
                                               self.domain_folder_name,
                                               self.html_folder_name,
                                               page_file_name)
                self.page_list.append([active_page_link, page_file_name])
                self.bs_object_dict[active_page_link] = BS_object
        # if no ";;;" separator found in link, then write as usual
        if len(self.links_contain_seprator) > 0:
            c_m.l_of_l_write(self.links_contain_seprator, self.main_path,
                             self.domain_folder_name,
                             self.link_contains_separator)
        else:
            c_m.l_of_l_write(self.page_list, self.main_path,
                             self.domain_folder_name, self.page_list_f_name)
        return self.bs_object_dict
示例#3
0
    def drive_input_links(self):
        # driving links
        input_links_nested_list = [[], []]

        while len(input_links_nested_list) > 0:
            # retieving input_link_list
            input_links_nested_list = c_m.l_of_l_read(self.link_input_path)

            url = input_links_nested_list[0][
                1]  #input link in nested list type data table.
            # will raise out of range error if empty input link file is provided.
            print "drive_input_links: INPUT URL:", url

            # driving modules
            site_selector = Common_Paths(url)  # System module

            # Comment out if you do not need site link management
            site_link_manager_sys = Site_Link_Manager(url)
            # checking if link is duplicate
            is_duplicate = site_link_manager_sys.check_if_duplicate()
            if is_duplicate:
                print "drive_input_links: GOING TO THE NEXT SITE LINK"
                input_links_nested_list.pop(0)
                c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
                continue
            print "drive_input_links: DUBLI CHECK DONE"
            #/

            # Comment out if you do not need html retrieval
            html_maker = HTML_File_Maker(url)
            site_bs_object_dict = html_maker.mk_htmls(
            )  # output html bs_object dict for next module
            print "drive_input_links: HTML FILES DONE BS OBJECTS READY"
            #/

            # Comment out if you do not need text extraction
            txt_maker = Text_Extractor(url)
            txt_maker.mk_text_files(site_bs_object_dict)
            print "drive_input_links: TEXT FILE MAKER DONE"
            #/

            # Comment out if you do not need english translation
            translator = Translator(url)
            translator.mk_eng_txt_files()
            print "drive_input_links: TRANSLATOR DONE"
            print "drive_input_links: ADDING TO VERI GOOD FILE"
            #/

            # Comment out if you do not need site link management
            site_link_manager_sys.add_to_veri_good()
            print "drive_input_links: LINK DONE, POPING..."
            input_links_nested_list.pop(0)
            print "drive_input_links: UPDATING INPUT LINK DATA"
            c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
示例#4
0
    def react_to_highlighting_request(self, event):

        test_input_key_list = self.read_key_input_field()

        print "IN: react_to_highlighting_request"
        print "react_to_highlighting_request: COMPARING INIT KEYWORD LIST WITH A LIST AFTER \"RETURN\" BINDED ACTION"

        if len(test_input_key_list) > len(self.test_hi_key_list):
            print "react_to_highlighting_request: NEW HIGHLIGHT KEYs ADDED TO HIGHLIGHT LIST"
            #react_to_highlighting_request: find added keywords and add to key list
            added_key_list = []
            for key in test_input_key_list:
                #react_to_highlighting_request: filtering out empty keys and leaving which are not in previous list
                if key not in self.test_hi_key_list and len(key) > 0:
                    added_key = key

                    added_key_list.append(
                        added_key
                    )  # highlight_keys_in_site tekes in keys as list

            print "react_to_highlighting_request: HIGHLIGHTING KEYS %r" % added_key_list
            self.highlight_keys_in_site(added_key_list)

            #react_to_highlighting_request: updating key list in local file
            self.init_h_key_list[0] = test_input_key_list
            c_m.l_of_l_write(self.init_h_key_list, self.all_hlights_f_path)
            self.test_hi_key_list = test_input_key_list

        elif len(test_input_key_list) == len(
                self.test_hi_key_list):  # enter presed without adding anything
            print "react_to_highlighting_request: ENTER PRESSED WITHOUT ADDING ANY HIGHLIGHTS"
            self.write_key_input_field(test_input_key_list,
                                       True)  # rewrite True

        else:  #react_to_highlighting_request: details: less keys are present in key input field than previous image of the field - self.test_hi_key_list
            print "react_to_highlighting_request: HIGHLIGHT KEYWORDs DELETED: ",

            #react_to_highlighting_request: need to know which highlight is removed.
            for h_key in self.test_hi_key_list:
                if h_key in test_input_key_list:
                    pass
                else:
                    deleted_key = h_key
                    print deleted_key
                    self.remove_highlight(deleted_key)
            #react_to_highlighting_request: rewriting keys to have corect format
            self.write_key_input_field(test_input_key_list, True)
            #react_to_highlighting_request: write new key list to txt file
            self.init_h_key_list[0] = test_input_key_list
            c_m.l_of_l_write(self.init_h_key_list, self.all_hlights_f_path)
            self.test_hi_key_list = test_input_key_list
    def mk_text_files(self, bs_object_dict):

        print "MAKING UNIQUE PAGE TEXT FILES"
        print "TOTAL PAGES TO BE PROCESSED", len(self.page_list)
        for ind in range(len(self.page_list)):

            copy_page_list = copy(self.page_list)
            poped_page_data_entry = self.page_list[ind]

            #forming page text file name
            split_page_html_f_name = poped_page_data_entry[1].split('.')
            page_text_f_name = split_page_html_f_name[0] + "_text.txt"

            #making a list of site text line, with active page excluded to be able to find unique text lines
            self.page_list[ind].append(page_text_f_name)

            copy_page_list.pop(ind)
            active_page_excluded_txt_l_list = []
            for data_line in copy_page_list:
                active_link = data_line[0]
                active_bs_object = bs_object_dict[active_link]
                page_text_list = self.extract_page_text(active_bs_object)
                active_page_excluded_txt_l_list = active_page_excluded_txt_l_list + page_text_list

            #removing duplicates from almost all text list
            clean_excluded_txt_l_list = c_m.remove_duplicates(
                active_page_excluded_txt_l_list)

            #checking if text lines from active page link are present in other pages
            poped_bs_object = bs_object_dict[poped_page_data_entry[0]]
            poped_page_text_list = self.extract_page_text(poped_bs_object)
            unique_text_line_list = []
            for text_line in poped_page_text_list:
                if text_line not in clean_excluded_txt_l_list:
                    unique_text_line_list.append(text_line)
                else:
                    pass

            #counting lengths of texts for sorting and writing text to file
            symbols_written_to_page = c_m.l_of_l_write(
                unique_text_line_list, self.text_folder_path,
                page_text_f_name)  # write_to_file
            self.page_list[ind].append(str(symbols_written_to_page))

            print self.page_list[ind][2], "DONE"

        c_m.l_of_l_write(self.page_list,
                         self.page_list_path)  #write new page list
示例#6
0
    def veri_good_b(self):
        """ Resets second try states,
        repopulates tabs with next site data,
        removes current site link from veri_ready and
        adds to veri_good file
        highlights newly added text"""

        # status string written into the site's data entry on approval
        veri_good_site_status = "VERI_GOOD"

        print "IN: veri_good_b. VERI GOOD BUTTON CLICKED"
        print "veri_good_b: REPOPULATING TABS WITH NEXT SITE DATA"

        #veri_good_b: RESETTING SECOND TRY MARKERS"
        self.eng_b_second_try = False  # clearing english button tries
        self.original_b_second_try = False  # clearing original button tries

        old_tab_data_set = self.init_tabs_data_set_list
        #veri_good_b: DELETING OLD TABS
        self.delete_tabs(old_tab_data_set)
        # mark the current (head) entry as approved before persisting it
        self.veri_ready_list[0][1] = veri_good_site_status
        current_site_data_entry = self.veri_ready_list[0]

        #veri_good_b: ADDING CLICKED AS GOOD SITE DATA ENTRY TO VERI GOOD FILE
        # NOTE(review): attribute name is "very_good_f_path" (sic) while the
        # list is "veri_ready" - the spelling mismatch is in the class API
        c_m.txt_file_append(current_site_data_entry, self.very_good_f_path)

        #veri_good_b: GOING TO THE NEXT SITE DATA ENTRY IN VERY READI LIST
        # NOTE(review): if the popped entry was the last one, the [0] lookup
        # below raises IndexError - confirm callers guarantee a next entry
        self.veri_ready_list.pop(0)
        next_site_data_entry = self.veri_ready_list[0]

        #veri_good_b: LOADING NEXT SITE TABS
        #veri_good_b: using global init_tabs_data_set_list parameter to delete tabs after button click
        self.init_tabs_data_set_list = self.mk_tabs_data_set(
            next_site_data_entry)
        # "laod_tabs" (sic) is the method's actual name in this class
        self.laod_tabs(self.init_tabs_data_set_list, self.eng_text)

        print "veri_good_b: HIGHLIGHTING NEW SITE KEYWORDS"
        self.highlight_keys_in_site(self.test_hi_key_list)

        #veri_good_b: UPDATING VERI READY FILE WITH FIRST DELETED ENTRY VERI READY LIST
        c_m.l_of_l_write(self.veri_ready_list, self.very_ready_f_path)
示例#7
0
    def write_key_input_field(self, list_to_write, *arg):
        #write_key_input_field: failsafing input list
        no_empty = []
        for item in list_to_write:
            no_white_space = item.strip()  # stripping white spaces characters
            if len(no_white_space) > 1:
                no_empty.append(no_white_space)
            else:
                arg = True  # if found at least one empty item setting to rewriting mode
                print "write_key_input_field: EMPTY, WHITE SPACE, OR ONE SYMBOL KEY PRESENT IN THE KEYLIST"

        if arg:  #write_key_input_field rewriting mode
            self.input_as_text.delete('1.0', END)
            as_text = unicode("\n".join(no_empty))
            #write_key_input_field updating global keylist and file keylist with corected version
            self.init_h_key_list[0] = no_empty
            c_m.l_of_l_write(self.init_h_key_list, self.all_hlights_f_path)
        else:
            as_text = unicode("\n".join(no_empty) + "\n")

        self.input_as_text.insert(END, as_text)
        return no_empty
    def drive_input_links(self):
        """ DRIVES INPUT LINKS AND MODULES,
            FOR PARTIAL FUNCTIONALITY COMMENT OUT MARKED MODULES"""
        # driving links
        # seed value just to enter the loop; real data is read inside
        input_links_nested_list = [[], []]

        while len(input_links_nested_list) > 0:
            # retieving input_link_list
            input_links_nested_list = c_m.l_of_l_read(self.link_input_path)
            try:
                url = input_links_nested_list[0][
                    1]  #input link in nested list type data table.
                # will raise out of range error if empty input link file is provided.
            except IndexError:
                print "drive_input_links: IndexError: MOST LIKELY INPUT FILE IS EMPTY, ADD SOME SITE LINKS"
                break

            print "drive_input_links: INPUT URL:", url

            # driving modules
            # NOTE(review): site_selector looks unused but Common_Paths is
            # labelled "System module" - presumably its constructor sets up
            # per-site paths as a side effect; confirm before removing
            site_selector = Common_Paths(url)  # System module

            # ---COMMENT OUT if you do not need SITE LINK MANAGEMENT AND INPUT URL DUBLI CHECK:
            # site_link_manager_sys = Site_Link_Manager(url)
            # is_duplicate = site_link_manager_sys.check_if_duplicate()
            # if is_duplicate:
            # print "drive_input_links: INPUT SITE LINK IS DUPLICATE, TAKING NEXT LINK"
            # input_links_nested_list.pop(0)
            # c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
            # continue
            # print "drive_input_links: DUBLI CHECK DONE"
            #///

            # ---COMMENT OUT if you do not need HTML RETRIEVAL
            html_maker = HTML_File_Maker(url)
            site_bs_object_dict = html_maker.mk_htmls(
            )  # output html bs_object dict for next module
            # a False return signals retrieval failure: record the url in the
            # failed-sites file and move on to the next input link
            if site_bs_object_dict is False:
                failed_entry = ["FAILED", url]
                c_m.txt_file_append(failed_entry, self.site_failed_path)
                input_links_nested_list.pop(0)
                continue
            print "drive_input_links: HTML FILES DONE BS OBJECTS READY"
            #///

            # ---COMMENT OUT if you do not need TEXT EXTRACTION
            txt_maker = Text_Extractor(url)
            txt_maker.mk_text_files(site_bs_object_dict)
            print "drive_input_links: TEXT FILE MAKER DONE"
            #///

            # ---COMMENT OUT if you do not need ENGLISH TRANSLATION
            translator = Translator(url)
            translator.mk_eng_txt_files(rewrite=False)
            print "drive_input_links: TRANSLATOR DONE"
            #///

            # ---COMMENT OUT do not need VERI_GOOD FILE UPDATE.
            # print "drive_input_links: ADDING TO VERI GOOD FILE"
            # site_link_manager_sys.add_to_veri_good()
            # print "drive_input_links: LINK DONE, POPING..."
            #///
            """ !!! POP INPUT LINK AND UPDATE INPUT LINK FILE NECESARRY FOR NON ETERNAL LOOP """
            input_links_nested_list.pop(0)
            print "drive_input_links: UPDATING INPUT LINK DATA"
            c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
            """ /// """
示例#9
0
    def mk_eng_txt_files(self):
        """ Takes a list of texts parts feeds to translate,
        gets the translated text,
        stacks it together"""

        self.page_list_path
        self.text_folder_path
        main_path = 'E:\\Python_work_files\\Projects\\site_data'
        main_list_f_name = "main_list.txt"
        new_page_list = []

        print "RETRIEVING MAIN_LIST.........."
        main_list = c_m.l_of_l_read(self.page_list_path)

        # iterating throug unique text per page txts
        for data_set in main_list:

            p_text_f_name = data_set[2]
            print "TRANSLATING TEXT FROM FILE %s" % p_text_f_name

            eng_p_text_f_name = "eng_" + p_text_f_name
            error_eng_p_text_f_name = "error_" + eng_p_text_f_name

            page_text = c_m.simply_read(self.text_folder_path, p_text_f_name)

            # if page has less than 10 symbols it is not translated
            if len(page_text) < 10:
                c_m.simply_write(page_text, self.text_eng_folder_path,
                                 eng_p_text_f_name)
                continue

            # initial text output value for while loop
            text_output = ""

            # loop safety paramters
            track_while_loops = 0
            max_while_loops = 5

            # sleep times for not to abuse translate and for every part to work:
            ext_get_sleep = 2
            text_translate_sleep = 1
            text_paste_sleep = 1
            text_copy_to_clip_sleep = 1

            # big while cycle to submit input as long as no output is generated
            while len(text_output) == 0:
                # eternal loop safeguard
                if track_while_loops > max_while_loops:
                    # raise error
                    print "MAXIMUM TRIES TO TO TRANSLATE THE SAMPLE EXEEDED \npath: %s\\%s\\eng_TXT\\%s" % (
                        main_path, self.domain, error_eng_p_text_f_name)
                    error_massage = "error when translating page"
                    c_m.simply_write(error_massage, self.text_eng_folder_path,
                                     error_eng_p_text_f_name)

                # getting translate popup as html page:
                self.driver.get(
                    "chrome-extension://aapbdbdomjkkjkaonfhkkikfgjllcleb/popup.html"
                )
                print "SLEEPING %d seconds, after EXTENSION GET" % ext_get_sleep
                time.sleep(ext_get_sleep)

                print "POPULATING CLIPBOARD with text from file...."
                pyperclip.copy(page_text)
                print "Sleeping after copying to clipboard for %d s" % text_copy_to_clip_sleep
                time.sleep(text_copy_to_clip_sleep)

                # Finding text input element
                text_input_el = self.driver.find_element_by_id('text-input')
                #sending command via selenium Keys
                text_input_el.send_keys(Keys.CONTROL, 'v')
                print "Sleeping after pasting for %d" % text_paste_sleep
                time.sleep(text_paste_sleep)
                #submit to translate
                print "Pressing return"
                text_input_el.send_keys(Keys.RETURN)

                text_output_tries = 0  # initial number of small while loop tries

                # skips find output element and take text when trying for the first time
                if track_while_loops == 0 and eng_p_text_f_name == "eng_page_0_text.txt":
                    track_while_loops += 1
                    continue

                # small while cycle waiting for input to be processed
                while text_output_tries < 5 and not text_output:
                    text_output_tries += 1
                    print "sleeping after submition for %d s. Waiting for text to be translated" % text_translate_sleep
                    time.sleep(text_translate_sleep)

                    #find output element, take text
                    try:
                        text_output_el = self.driver.find_element_by_xpath(
                            '//*[@id="translation"]/div[6]')
                        text_output = text_output_el.text
                        print "TRANS_TEST_1: input sample\n%r" % page_text
                        print "TRANS_TEST_1: text_sample_ouput\n%r" % text_output
                    except:
                        print "Trying for %d time - THE TRANSLATED SAMPLE WAS NOT GENERATED" % text_output_tries

            print "WRITING TRANSLATED OUTPUT TO FILE: ", eng_p_text_f_name
            c_m.simply_write(text_output, self.text_eng_folder_path,
                             eng_p_text_f_name)
            data_set.append(eng_p_text_f_name)
            new_page_list.append(data_set)
            track_while_loops += 1  # incrementing tries
        print "DONE TRANSLATING SITE %s " % self.domain
        print "UPDATING PAGE LIST WITH ENG TEXT FILE NAMES"
        c_m.l_of_l_write(new_page_list, self.page_list_path)
        self.driver.quit()  # working chrome window closing