def parse(self, response):
    """Let the user pick example/translation pairs from a results page.

    Extracts example sentences and their translations from ``response``,
    prints them numbered, asks which ones to keep and, once the user
    confirms the selection, appends them to ``examples.txt`` as
    ``example|translation`` lines.

    When the page yields no examples, the tail of the URL is recorded in
    ``self.faillist`` and nothing is written.

    Raises:
        CloseSpider: if either prompt loop runs for more than 1000 rounds.
    """
    # The response does not change between prompt rounds, so run the XPath
    # queries once instead of on every retry.
    exlist = response.xpath(
        '//*/div[@class="additional-entry"]/div[@class="col2"]'
        '/div[@class="text-to-speech"]/@data-text'
    ).getall()
    exlist += response.xpath(
        '//*[@id="inner-content"]'
        '/section[@class="more example external-example"]'
        '/descendant::*/div[@class="col2"]/div/span/text()'
    ).getall()
    trlist = response.xpath(
        '//*/div[@class="additional-entry"]/div[@class="col1"]/div[2]'
        '/div[@class="text-to-speech"]/@data-text'
    ).getall()
    trlist += response.xpath(
        '//*[@id="inner-content"]'
        '/section[@class="more example external-example"]'
        '/descendant::*/div[@class="col1"]/div[@class="trans-line"]'
        '/div/text()'
    ).getall()
    trlist = [text.strip() for text in trlist]

    if not exlist:
        eyerelief()
        print(
            colors.warning('ERROR - NO RESULTS RETURNED; SKIPPING' +
                           response.url[44:]))
        self.faillist.append(response.url[44:])
        # Return instead of breaking out of a loop: with no selection made
        # there is nothing to save, so the write step must not be reached.
        return

    satisfied = False
    exitvalve = 0
    while not satisfied:
        exitvalve += 1
        if exitvalve > 1000:
            raise CloseSpider(loopbreak)

        eyerelief()
        for n, (ex, tr) in enumerate(zip(exlist, trlist)):
            print(f'{n}\n{colors.bluetext(ex)}\n{colors.information(tr)}')
        eyerelief()

        # Trailing space after "like" matters: the two literals are joined
        # by implicit concatenation.
        userchoice = input(
            colors.prompt(
                'Enter the numbers of the examples above you would like '
                'to save, separated by commas:\n'))
        exitvalve2 = 0
        # Validate against both lists because zip() above truncates to the
        # shorter one and the selection indexes into both.
        while (not is_valid_list(userchoice, exlist)
               or not is_valid_list(userchoice, trlist)):
            exitvalve2 += 1
            if exitvalve2 > 1000:
                raise CloseSpider(loopbreak)
            userchoice = input(
                colors.warning(
                    'Invalid choice. Please enter the numbers of the '
                    'examples you would like to save separated by '
                    'commas:\n'))

        # De-duplicate, then sort so the echo and the saved file follow the
        # on-screen numbering rather than set iteration order.
        userchoice = sorted({int(s) for s in userchoice.split(',')})

        print(colors.prompt("you selected:"))
        for num in userchoice:
            print(
                colors.parrot(f'{num}\n') +
                colors.bluetext(f'{exlist[num]}\n') +
                colors.information(f'{trlist[num]}\n'))
        if yesno_prompt(
                colors.prompt('Is that correct? y/n:\n'),
                colors.warning('Invalid entry. Please enter y or n:\n')):
            satisfied = True

    with open('examples.txt', 'a') as output:
        for num in userchoice:
            output.write(f'{exlist[num]}|{trlist[num]}\n')
def parse(self, response):
    """Let the user pick example sentences and supply translations.

    Extracts the ``v2-sentence-box`` divs from ``response``, strips them
    down to the bare sentence, lists them longest first and asks the user
    which ones to keep.  For every confirmed sentence the user is asked for
    a translation, which must also be confirmed.

    Yields:
        One item per selected sentence, loaded through ``ItemLoader`` with
        ``WordspiderItem`` and the fields ``example`` and ``translation``.

    Raises:
        CloseSpider: if any prompt loop runs for more than 1000 rounds.
    """
    loop_limit_message = 'Maxmimum iterations exceeded; while loop broken'
    target_word = urllib.parse.unquote(response.url)[43:]
    examples = response.xpath(
        "//div[@class='v2-sentence-box']"
        "[not(@style='display: inline-block;')]"
        "[not(@style='height: 317px; padding-bottom: 25px; "
        "padding-top: 5px; display: inline-block;')]"
    ).getall()

    # Each extracted div is mostly unwanted content and looks like this:
    #
    #   <div class="v2-sentence-box">
    #   Прошло время, <b>птенцы</b> выросли и улетели...
    #   <div class="v2-sentence-source">
    #   <a href="..." onclick="plantHlTag(...)">source title</a>
    #   </div>
    #   </div>
    #
    # Keep only the sentence: drop everything from the
    # <div class="v2-sentence-source"> onwards, drop the opening tag that
    # precedes the sentence, and remove the <b> highlighting.
    output_list = []
    for verbose_example in examples:
        target = verbose_example.split(
            '<div class="v2-sentence-source">', 1)[0]
        _, newline, after_first_line = target.partition('\n')
        if newline:
            target = after_first_line
        else:
            # Markup on a single line: indexing the result of split('\n')
            # would raise IndexError here, so drop just the opening tag.
            _, closed, after_tag = target.partition('>')
            if closed:
                target = after_tag
        target = target.strip() + '\n'
        target = target.replace('<b>', '').replace('</b>', '')
        output_list.append(target)
    output_list.sort(key=len, reverse=True)

    if not output_list:
        print()
        print()
        print(colors.warning(f"ERROR: NO EXAMPLES FOUND FOR {target_word}!"))
        print(colors.warning("Did you misspell the word?"))
        print(colors.warning(f"Skipping {target_word}."))
        print()
        print()
        return

    print(divider1)
    print(divider1)

    selection_confirmed = False
    iters = 0
    while not selection_confirmed:
        iters += 1
        if iters > 1000:
            raise CloseSpider(loop_limit_message)

        # Alternate the colouring so adjacent sentences are easy to tell
        # apart; [:-1] drops the trailing newline added above.
        for num, sentence in enumerate(output_list):
            if num % 2 == 0:
                print(colors.parrot(f'{num} - {sentence[:-1]}'))
            else:
                print(f'{num} - {sentence[:-1]}')
        print()

        userchoice = input(
            colors.prompt(
                'Enter the numbers of the examples above which '
                'you would like to save, separated by commas: '))
        iter4 = 0
        while not is_valid_list(userchoice, output_list):
            iter4 += 1
            if iter4 > 1000:
                raise CloseSpider(loop_limit_message)
            userchoice = input(
                colors.warning(
                    'Invalid choice. Please enter the numbers of the '
                    'examples you would like to save separated by '
                    'commas: '))

        # De-duplicate, then sort so the sentences are handled in the order
        # they were listed rather than in set iteration order.
        userchoice = sorted({int(s) for s in userchoice.split(',')})

        # ask user to confirm
        print(colors.prompt("you selected:"))
        for num in userchoice:
            print(colors.parrot(f'{num} - {output_list[num]}'))
        if yesno_prompt(
                colors.prompt('Is that correct? y/n: '),
                colors.warning('Invalid entry. Please enter y or n: ')):
            selection_confirmed = True

    for num in userchoice:
        print()
        print(output_list[num])
        translation = input(
            colors.prompt(
                "Please enter a translation for the sentence above: "))

        translation_confirmed = False
        iters3 = 0
        while not translation_confirmed:
            iters3 += 1
            if iters3 > 1000:
                raise CloseSpider(loop_limit_message)
            print()
            print(colors.prompt("you entered:"))
            print()
            print(colors.parrot(translation))
            if yesno_prompt('Is that correct? y/n: ',
                            'Invalid selection. Please enter y or n: '):
                translation_confirmed = True
            else:
                print(output_list[num])
                translation = input(
                    colors.prompt(
                        'Please enter a translation for the sentence '
                        'above: '))

        loader = ItemLoader(item=WordspiderItem(), response=response)
        loader.add_value('example', output_list[num][:-1])
        loader.add_value('translation', translation)
        yield loader.load_item()
def getturkishwords():
    """Validate ``toscrape.txt`` and run ``TurkishSpider`` on its words.

    Prints a banner, checks that ``toscrape.txt`` exists and that every
    line passes ``validate_line``, shows the words to the user and asks for
    confirmation.  On "y" any previous ``examples.txt`` is removed and the
    spider is run; afterwards the entries collected in
    ``TurkishSpider.faillist`` are reported.  On "n" the process exits with
    status 0.  If the file is missing, empty or malformed a warning is
    printed and the function returns without scraping.

    Raises:
        RuntimeError: if the y/n prompt is answered invalidly more than
            1000 times.
    """
    banner_fig = Figlet(font='banner3-D', width=120)
    warning_fig = Figlet(font='xcourb', width=120)
    print(colors.information(banner_fig.renderText('Turkish Scraper')))

    if not os.path.exists('toscrape.txt'):
        print(
            colors.warning(
                warning_fig.renderText('Warning: toscrape.txt not found')))
        print(
            colors.warning('In order to scrape example sentences, you must '
                           'first create a file in the same directory as '
                           'getwords.py that contains a list of single '
                           'Turkish words, each on its own line.'))
        return

    # Read the file once; the same list is used for validation and display.
    with open('toscrape.txt') as f:
        words = [line.replace('\n', '') for line in f]

    warning1 = "toscrape.txt improperly formatted"
    warning2 = (
        "It appears that toscrape.txt is improperly formatted."
        " the file should contain only single Turkish words on"
        " new lines. Exiting.")
    # An empty file is rejected too: there would be nothing to scrape.
    if not words or not all(validate_line(word) for word in words):
        print(colors.warning(warning1 + '\n' + warning2))
        return

    print(
        colors.prompt(
            "toscrape.txt appears to be valid and contains the "
            "words listed below."))
    for word in words:
        print(colors.information(word))
    print()

    proceed = input(
        colors.prompt(
            'would you like to proceed with these entries? y/n:\n'))
    safetyvalve3 = 0
    # The length check is needed because `in "yYnN"` is a substring test
    # and would otherwise accept "" or "yY".
    while len(proceed) != 1 or proceed not in "yYnN":
        safetyvalve3 += 1
        if safetyvalve3 > 1000:
            raise RuntimeError("Max iterations exceeded! Exiting!")
        proceed = input(
            colors.prompt('Invalid entry. Please enter y or n\n'))

    if proceed in "nN":
        sys.exit(0)

    # Start from a clean output file.  Best effort: a missing file is
    # normal, and any other failure is reported without aborting the run.
    try:
        os.remove('examples.txt')
    except FileNotFoundError:
        pass
    except OSError as err:
        print(colors.warning(f'Could not remove examples.txt: {err}'))

    process = CrawlerProcess(get_project_settings())
    process.crawl(TurkishSpider)
    process.start()

    if TurkishSpider.faillist:
        print(colors.warning("Failed to retrieve the following words:"))
        for item in TurkishSpider.faillist:
            print(colors.warning(item))
def weave():
    """Run the word and stress spiders and build ``flashcards.txt``.

    Steps:

    1. Validate ``toscrape.txt`` (if present) and let the user choose
       between scraping those words or manual entry only.
    2. Optionally run ``scrapy crawl wordspider``, then collect manual
       examples via ``gather_man_input`` and store them with
       ``write_man_input`` in ``examples.csv``.
    3. Run ``scrapy crawl stressspider``.
    4. Combine ``examples.csv`` and ``stresses.csv``: every word of an
       example that has an entry in the stress table is replaced by its
       stressed form, asking the user when several forms exist or when no
       stress was found.
    5. Write ``example;translation`` lines to ``flashcards.txt`` and remove
       the two intermediate CSV files.

    Raises:
        RuntimeError: if the menu prompt is answered invalidly more than
            1000 times, or if there is nothing to scrape and no manual
            example was entered.
    """
    banner_fig = Figlet(font='banner3-D', width=120)
    warning_fig = Figlet(font='xcourb', width=120)
    print(colors.information(banner_fig.renderText('VerbScraper2.0')))

    # check to see if toscrape.txt exists and warn user if not
    words = []
    if not os.path.exists('toscrape.txt'):
        print(
            colors.warning(
                warning_fig.renderText('Warning: toscrape.txt not found')))
        print(
            colors.warning('In order to scrape example sentences, you must '
                           'first create a file in the same directory as '
                           'loom.py that contains a list of single Russian '
                           'words, each on its own line. Since this file '
                           'is missing, you will be prompted to enter '
                           'your own examples manually.'))
        toscrape = False
    else:
        # Read the lines once.  Testing emptiness with f.read() and then
        # iterating the same handle would leave nothing to validate.
        with open('toscrape.txt') as f:
            words = [line.rstrip('\n') for line in f]
        # NOTE(review): words are validated without their trailing newline;
        # confirm validate_word does not expect it.
        toscrape = bool(words) and all(validate_word(w) for w in words)
        if not toscrape:
            warning1 = "toscrape.txt improperly formatted"
            warning2 = ("It appears that toscrape.txt is improperly formatted."
                        " the file should contain only single Russian words on"
                        " new lines. Launching in manual entry only mode.")
            print(colors.warning(warning1 + '\n' + warning2))

    if toscrape:
        print(
            colors.prompt(
                "toscrape.txt appears to be valid and contains the "
                "words listed below."))
        for listed_word in words:
            print(colors.information(listed_word))
        print()
        answer = input(
            colors.prompt(
                'Would you like to:\n'
                '1 - proceed with these words (you will be prompted for '
                'optional manual entries as well)\n'
                '2 - proceed with manual entries only\n'
                'alternatively, enter "exit" to exit.\n'))
        exit_words = ("exit", "Exit", "EXIT")
        iterbreak = 0
        while answer not in ("1", "2") + exit_words:
            iterbreak += 1
            if iterbreak > 1000:
                raise RuntimeError("Max iterations exceded! Loop borken")
            answer = input(
                colors.prompt('Invalid entry. Please enter 1, 2, or exit\n'))
        if answer in exit_words:
            sys.exit(0)
        if answer == "2":
            toscrape = False

    # remove examples.csv and stresses.csv if they exist
    _remove_if_present('examples.csv')
    _remove_if_present('stresses.csv')

    if toscrape:
        # if the user has supplied toscrape.txt and opted to use it, run
        # wordspider
        Popen(shlex.split('scrapy crawl wordspider')).wait()

    # prompt user for manually added examples in addition to crawled examples
    man_examples = gather_man_input()
    if not toscrape and len(man_examples['example']) < 1:
        raise RuntimeError('No examples provided!')
    if len(man_examples['example']) > 0:
        write_man_input(man_examples, 'examples.csv')

    Popen(shlex.split('scrapy crawl stressspider')).wait()

    linestowrite = []
    with open('examples.csv') as ex, open('stresses.csv') as stresses:
        exreader = csv.DictReader(ex)
        streader = csv.DictReader(stresses)

        # Master dictionary of stresses: keys are unstressed words, values
        # are lists of potential stressed forms for that word.
        stress_dict = {}
        for row in streader:
            stress_dict.setdefault(row['clean'], []).append(row['stressed'])

        for row in exreader:
            tentative_text = row['example']
            for position, word in enumerate(word_list(row['example']),
                                            start=1):
                # Resolve the key once (exact spelling first, then
                # lower-case) and use it for every later lookup, so a word
                # that only matches in its original case cannot raise
                # KeyError.
                if word in stress_dict:
                    key = word
                elif word.lower() in stress_dict:
                    key = word.lower()
                else:
                    continue
                options = stress_dict[key]
                target_word = options[0]

                # A stressed form is recognised by its "</font>" markup;
                # without it, offer manual stress entry.
                if "</font>" not in target_word.lower():
                    print()
                    print(
                        colors.prompt(
                            f"It appears no stress was found for {word}"))
                    if yesno_prompt(
                            colors.prompt(
                                'Would you like to enter a stress mannually? y/n: '
                            ),
                            colors.warning(
                                'Invalid entry. Please enter y or n: ')):
                        stress_index = _prompt_manual_stress(target_word)
                        if stress_index is not None:
                            options[0] = man_stress(target_word.lower(),
                                                    stress_index)
                        print()

                if len(options) > 1:
                    print()
                    print(colors.parrot(row['example']))
                    print(colors.parrot(row['translation']))
                    print()
                    print(
                        colors.prompt(
                            f'Word {position} in the example above has '
                            f'{len(options)} stress options'))
                    print()
                    for number, option in enumerate(options, start=1):
                        print(
                            colors.prompt(f'{number} -- '
                                          f'{visual_stress(option)}'))
                    print()
                    user_in = input(
                        colors.prompt(
                            f'Please enter the appropriate stress '
                            f'for word {position}: '))
                    while not input_isvalid(user_in, options):
                        user_in = input(
                            colors.warning(
                                'Invalid entry. Please enter a number '
                                'corresponding to a choice above: '))
                    chosen = options[int(user_in) - 1]
                else:
                    chosen = options[0]

                # Replace the first lower-case and the first capitalised
                # occurrence of the word with the chosen stressed form.
                tentative_text = tentative_text.replace(
                    word.lower(), chosen, 1)
                tentative_text = tentative_text.replace(
                    word.capitalize(), chosen.capitalize(), 1)

            linestowrite.append(tentative_text + ';' + row['translation'] +
                                '\n')

    with open('flashcards.txt', 'w') as f:
        f.writelines(linestowrite)
    cards_written = len(linestowrite)
    noun = 'card' if cards_written == 1 else 'cards'
    success_banner(
        f'Success! {cards_written} {noun} written to flashcards.txt')

    _remove_if_present('examples.csv')
    _remove_if_present('stresses.csv')


def _remove_if_present(path):
    """Delete ``path``; a missing file is fine, other failures only warn."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as err:
        print(colors.warning(f'Could not remove {path}: {err}'))


def _print_letter_ruler(target_word):
    """Print the letters of ``target_word`` above their 1-based positions."""
    for letter in target_word.lower():
        print(colors.parrot(f'{letter:3s}'), end=" ")
    print()
    for number in range(1, len(target_word) + 1):
        print(colors.parrot(f'{str(number):3s}'), end=" ")
    print()


def _prompt_manual_stress(target_word):
    """Ask which letter of ``target_word`` to stress.

    Returns the confirmed 0-based letter index, or ``None`` when one of the
    prompt loops exceeds 1000 rounds.
    """
    iters = 0
    while True:
        iters += 1
        if iters > 1000:
            print("max iterations exceeded; break")
            return None

        _print_letter_ruler(target_word)
        stress_choice = input(
            colors.prompt(
                "Please enter the number of the letter you wish to stress: "
            ))
        iters2 = 0
        while not input_isvalid(stress_choice, target_word):
            iters2 += 1
            if iters2 > 1000:
                print('Maxmimum iterations exceeded; while loop broken')
                # Give up rather than carry on with an entry that failed
                # validation (int() on it below could raise).
                return None
            _print_letter_ruler(target_word)
            stress_choice = input(
                colors.warning(
                    "Invalid entry. Please select one of the numbers listed above "
                ))

        # Show the letter from target_word: that is the string the entry
        # was validated against and the one displayed in the ruler.
        stress_index = int(stress_choice) - 1
        if yesno_prompt(
                colors.prompt(
                    f"You want to place the stress on '{target_word[stress_index]}' at position {stress_choice}, correct? y/n "
                ),
                colors.warning("Invalid entry. Please enter y or n: ")):
            return stress_index
def parse(self, response):
    """Let the user pick example/translation pairs from a results page.

    Extracts the example and translation divs from ``response``, strips the
    known wrapper markup from them, prints the pairs numbered and asks
    which ones to keep.  Once the user confirms the selection it is
    appended to ``examples.txt`` as ``example|translation`` lines.

    When the page yields no examples, the tail of the URL is recorded in
    ``self.faillist`` and nothing is written.

    Raises:
        CloseSpider: if either prompt loop runs for more than 1000 rounds.
    """
    # Markup fragments removed verbatim, in this order, from every
    # extracted div.
    unwanted_text = [
        '<div dir="ltr" class="span6">', '<b>', '</b>', '</div>',
        '<div dir="ltr">'
    ]

    def strip_markup(fragment):
        """Remove every ``unwanted_text`` entry from ``fragment``."""
        for unwanted in unwanted_text:
            fragment = fragment.replace(unwanted, '')
        return fragment

    # The response does not change between prompt rounds, so extract and
    # clean once instead of on every retry.
    exlist = response.xpath(
        '//div[@class="examples"]/div[@class="row-fluid"]/div[@dir="ltr"]'
    ).getall()
    trlist = response.xpath(
        '//div[@class="examples"]/div[@class="row-fluid"]'
        '/div[@lang="en"]/div[@dir="ltr"]'
    ).getall()

    if not exlist:
        eyerelief()
        print(
            colors.warning('ERROR - NO RESULTS RETURNED; SKIPPING' +
                           response.url[25:]))
        self.faillist.append(response.url[25:])
        # Return instead of breaking out of a loop: with no selection made
        # there is nothing to save, so the write step must not be reached.
        return

    # zip() pairs the lists up to the shorter one, exactly as the display
    # loop below expects.
    exlist_clean = []
    trlist_clean = []
    for ex, tr in zip(exlist, trlist):
        exlist_clean.append(strip_markup(ex))
        trlist_clean.append(strip_markup(tr))

    satisfied = False
    exitvalve = 0
    while not satisfied:
        exitvalve += 1
        if exitvalve > 1000:
            raise CloseSpider(loopbreak)

        eyerelief()
        for n, (ex_clean, tr_clean) in enumerate(
                zip(exlist_clean, trlist_clean)):
            print(
                f'{n}\n{colors.bluetext(ex_clean)}\n{colors.information(tr_clean)}'
            )
        eyerelief()

        # Trailing space after "like" matters: the two literals are joined
        # by implicit concatenation.
        userchoice = input(
            colors.prompt(
                'Enter the numbers of the examples above you would like '
                'to save, separated by commas:\n'))
        exitvalve2 = 0
        # Validate against both raw lists: the cleaned lists are as long as
        # the shorter of the two.
        while (not is_valid_list(userchoice, exlist)
               or not is_valid_list(userchoice, trlist)):
            exitvalve2 += 1
            if exitvalve2 > 1000:
                raise CloseSpider(loopbreak)
            userchoice = input(
                colors.warning(
                    'Invalid choice. Please enter the numbers of the '
                    'examples you would like to save separated by '
                    'commas:\n'))

        # De-duplicate, then sort so the echo and the saved file follow the
        # on-screen numbering rather than set iteration order.
        userchoice = sorted({int(s) for s in userchoice.split(',')})

        print(colors.prompt("you selected:"))
        for num in userchoice:
            print(
                colors.parrot(f'{num}\n') +
                colors.bluetext(f'{exlist_clean[num]}\n') +
                colors.information(f'{trlist_clean[num]}\n'))
        if yesno_prompt(
                colors.prompt('Is that correct? y/n:\n'),
                colors.warning('Invalid entry. Please enter y or n:\n')):
            satisfied = True

    with open('examples.txt', 'a') as output:
        for num in userchoice:
            output.write(f'{exlist_clean[num]}|{trlist_clean[num]}\n')