Exemplo n.º 1
0
def parse_page(html_doc, sent_hist, download_audio):
    """Collect sentences and their direct translations from a results page.

    Every ``div`` with class ``sentences_set`` found in *html_doc* is
    examined.  A set is skipped when it contains no nested ``div``, when
    the ``data-sentence-id`` of its first nested ``div`` is already in
    *sent_hist*, or when it has no ``div`` with class ``directTranslation``.

    Args:
        html_doc: Markup passed straight to ``BeautifulSoup``.
        sent_hist: Container of sentence ids to skip.  It is only used for
            membership tests and is never modified here.
        download_audio: When this compares equal to ``True``, the first
            nested ``div`` is searched for a link titled "Play audio"; if
            one with an ``href`` is found, it is fetched with
            ``downloadPage.download`` and saved under ``audio/``.

    Returns:
        A list of dicts, one per accepted sentence, with the keys ``id``,
        ``translations`` (list of translation texts), ``main_sentence``,
        ``audio_link`` and ``audio_file``.  The two audio entries are
        ``None`` when audio was not requested or no link was found.
    """
    soup = BeautifulSoup(html_doc)
    sentence_sets = soup.find_all('div', attrs={'class': 'sentences_set'})

    lst_sentence = []

    for i, sentence_set in enumerate(sentence_sets):
        print(i)
        all_div = sentence_set.find_all('div')
        if not all_div:
            # Nothing to read an id or a sentence from; such a set cannot
            # contain translations either, so treat it like an empty one.
            continue

        head = all_div[0]
        sentence_id = head.get('data-sentence-id')
        if sentence_id in sent_hist:
            continue

        # Only direct translations are used; the original author marked
        # indirect translations as bad.
        all_translations = sentence_set.find_all(
            'div', attrs={'class': 'directTranslation'})
        if not all_translations:
            continue

        translations = [
            node.find('div', attrs={'class': 'text'}).get_text()
            for node in all_translations
        ]
        main_sentence = head.find('div', attrs={'class': 'text'}).get_text()

        audio_link = None
        audio_file = None
        # Kept as an equality test so that only values equal to True
        # (True, 1) enable the download, exactly as before.
        if download_audio == True:  # noqa: E712
            anchor = head.find('a', attrs={'title': 'Play audio'})
            href = anchor.get('href') if anchor is not None else None
            if href is not None:
                audio_link = href
                # The file name is the last path component of the link.
                audio_file = href.split('/')[-1]

        lst_sentence.append({
            u'id': sentence_id,
            u'translations': translations,
            u'main_sentence': main_sentence,
            u'audio_link': audio_link,
            u'audio_file': audio_file,
        })

        if audio_link is not None:
            audio_content = downloadPage.download(audio_link)
            # NOTE(review): a caller elsewhere compares download()'s result
            # to the string 'Error'; if that sentinel can be returned here
            # it is written to disk as-is -- confirm whether it should be
            # skipped instead.
            #
            # Audio is binary data: write bytes through a binary handle so
            # no newline translation or text encoding is applied.  Text
            # mode is kept only for a non-bytes payload.
            mode = 'wb' if isinstance(audio_content, bytes) else 'w'
            with open('audio/' + audio_file, mode) as f:
                f.write(audio_content)

    return lst_sentence
Exemplo n.º 2
0
import whois

from downloadPage import download
from downloadSitemap import crawl_sitemap

# Script body: both calls run at import time and their return values are
# discarded.

# Call download() on the site's front page.
download("https://www.varzesh3.com/")
# Call crawl_sitemap() on the example sitemap URL.
crawl_sitemap('http://example.webscraping.com/sitemap.xml')
# Disabled WHOIS lookup for the same host, kept for reference; it is the
# only visible use of the `whois` import.
# print(whois.whois("www.varzesh3.com"))
Exemplo n.º 3
0
        f = codecs.open(filename, 'r', 'utf-8')
        word_history = f.read().split()
        f.close()

    for i in range(0, len(wordlist)):
        strOut = str(i) + '/' + str(len(wordlist))
        print(strOut)
        print(wordlist[i])
        if not unicode(wordlist[i]) in word_history:
            # Don't look up numbers.
            try:
                float(unicode(wordlist[i]))
            except:
                url = u'http://tatoeba.org/eng/sentences/search?query=' + unicode(wordlist[i]) + u'&from=' + sentence_lang + u'&to=' + translation_lang + u'&orphans=no&unapproved=no&native=yes&user=&tags=&has_audio=' + audio + '&trans_filter=limit&trans_to=eng&trans_link=&trans_user=&trans_orphan=no&trans_unapproved=no&trans_has_audio=&sort=words'
                print(url)
                html_doc = downloadPage.download(url)

                word_history.append(unicode(wordlist[i]))

                if html_doc == 'Error':
                    print('Couldn\'t connect.')
                    exit(1)
                else:
                    lst_sentences = parse_page(html_doc, sentence_history, download_audio)

                    if len(lst_sentences) > 0:
                        str_csv = u''
                        for j in range(0, len(lst_sentences)):
                            # Don't write empty sentences.
                            str_row = unicode(csv_row(lst_sentences[j]))
                            if len(str_row.split('|')[1]):