# Parse one Tatoeba search-results page. Returns a list of sentence dicts
# (id, main sentence, direct translations, optional audio link/file) for
# every sentence that is not already in sent_hist.
from bs4 import BeautifulSoup

import downloadPage


def parse_page(html_doc, sent_hist, download_audio):
    soup = BeautifulSoup(html_doc, 'html.parser')
    sentence_sets = soup.find_all('div', attrs={'class': 'sentences_set'})
    lst_sentence = []
    for i, sentence_set in enumerate(sentence_sets):
        print(i)
        all_div = sentence_set.find_all('div')
        sentence_id = all_div[0].get('data-sentence-id')
        if sentence_id in sent_hist:
            continue
        lst_sentence.append({'id': sentence_id, 'translations': []})
        # Keep only direct translations; indirectTranslation divs are skipped.
        all_translations = sentence_set.find_all(
            'div', attrs={'class': 'directTranslation'})
        if len(all_translations) == 0:
            # No direct translation: discard the sentence again.
            lst_sentence.pop()
            continue
        for translation_div in all_translations:
            translation = translation_div.find(
                'div', attrs={'class': 'text'}).get_text()
            lst_sentence[-1]['translations'].append(translation)
        main_sentence = all_div[0].find(
            'div', attrs={'class': 'text'}).get_text()
        lst_sentence[-1]['main_sentence'] = main_sentence
        audio_link = None
        audio_file = None
        if download_audio:
            # The audio anchor may be missing, so check for None explicitly
            # instead of relying on a bare except.
            audio_anchor = all_div[0].find('a', attrs={'title': 'Play audio'})
            if audio_anchor is not None:
                audio_link = audio_anchor.get('href')
                audio_file = audio_link.split('/')[-1]
        lst_sentence[-1]['audio_link'] = audio_link
        lst_sentence[-1]['audio_file'] = audio_file
        if audio_link is not None:
            audio_content = downloadPage.download(audio_link)
            # Audio is binary data, so write the file in binary mode.
            with open('audio/' + audio_file, 'wb') as f:
                f.write(audio_content)
    return lst_sentence
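# Hypothetical usage sketch (not part of the original script): fetch one
# Tatoeba search page and print the parsed sentences. Assumes
# downloadPage.download returns the page HTML, with the string 'Error' on
# failure, as the calling code elsewhere in this project expects.
if __name__ == '__main__':
    demo_url = ('http://tatoeba.org/eng/sentences/search'
                '?query=hello&from=eng&to=deu')
    demo_html = downloadPage.download(demo_url)
    if demo_html != 'Error':
        for s in parse_page(demo_html, sent_hist=[], download_audio=False):
            print(s['main_sentence'], '->', s['translations'])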
import whois

from downloadPage import download
from downloadSitemap import crawl_sitemap

download("https://www.varzesh3.com/")
crawl_sitemap('http://example.webscraping.com/sitemap.xml')
# print(whois.whois("www.varzesh3.com"))
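# downloadPage is this project's own module; as a hedged illustration only
# (the real module may differ), a download helper matching the 'Error'
# sentinel used by its callers could be as small as:
import urllib.error
import urllib.request


def download_sketch(url):
    # Fetch a URL and return the raw response body, or the string 'Error'
    # when the request fails, mirroring the html_doc == 'Error' check used
    # by the word-lookup loop below.
    try:
        with urllib.request.urlopen(url) as response:
            return response.read()
    except urllib.error.URLError:
        return 'Error'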
# Load the word history so words already looked up are skipped on re-runs.
with open(filename, 'r', encoding='utf-8') as f:
    word_history = f.read().split()

for i, word in enumerate(wordlist):
    print(str(i) + '/' + str(len(wordlist)))
    print(word)
    if word in word_history:
        continue
    # Don't look up plain numbers.
    try:
        float(word)
        continue
    except ValueError:
        pass
    url = ('http://tatoeba.org/eng/sentences/search?query=' + word +
           '&from=' + sentence_lang + '&to=' + translation_lang +
           '&orphans=no&unapproved=no&native=yes&user=&tags=' +
           '&has_audio=' + audio +
           '&trans_filter=limit&trans_to=eng&trans_link=&trans_user=' +
           '&trans_orphan=no&trans_unapproved=no&trans_has_audio=' +
           '&sort=words')
    print(url)
    html_doc = downloadPage.download(url)
    word_history.append(word)
    if html_doc == 'Error':
        print("Couldn't connect.")
        exit(1)
    lst_sentences = parse_page(html_doc, sentence_history, download_audio)
    if len(lst_sentences) > 0:
        str_csv = ''
        for sentence in lst_sentences:
            # Don't write empty sentences.
            str_row = csv_row(sentence)
            if len(str_row.split('|')[1]):