Example #1
import urllib.request

import bs4 as bs


def wiki_parse(flag=False):
    # Fetch the meta-page listing the articles that should exist in every
    # language edition of Wikipedia.
    page = urllib.request.urlopen("https://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F:%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%81%D1%82%D0%B0%D1%82%D0%B5%D0%B9,_%D0%BA%D0%BE%D1%82%D0%BE%D1%80%D1%8B%D0%B5_%D0%B4%D0%BE%D0%BB%D0%B6%D0%BD%D1%8B_%D0%B1%D1%8B%D1%82%D1%8C_%D0%B2%D0%BE_%D0%B2%D1%81%D0%B5%D1%85_%D1%8F%D0%B7%D1%8B%D0%BA%D0%BE%D0%B2%D1%8B%D1%85_%D0%B2%D0%B5%D1%80%D1%81%D0%B8%D1%8F%D1%85")
    soup = bs.BeautifulSoup(page.read(), 'lxml')
    # Collect (link text, href) pairs for every non-empty link on the page.
    urls = []
    for link in soup.find_all('a'):
        if link.text != "":
            urls.append((link.text, link.get('href')))
    # Trim the navigation links surrounding the article list itself.
    urls = urls[73:-245]
    output = []
    for i in range(998):
        scrapped_data = urllib.request.urlopen('https://ru.wikipedia.org' + urls[i][1])
        parsed_article = bs.BeautifulSoup(scrapped_data.read(), 'lxml')
        print(i, end=" ")

        # Concatenate the text of all paragraphs of the article.
        article_text = ""
        for p in parsed_article.find_all('p'):
            article_text += p.text

        processed_article = format_sentence(article_text)  # helper defined elsewhere in the module
        output.append(processed_article)
        if flag:
            # Optionally dump each article to its own file: title on the
            # first line, normalized text below.
            try:
                with open("corpus/wikis/wiki" + str(i) + ".txt", "w+", encoding='utf-8') as f:
                    f.write(urls[i][0] + '\n' + processed_article)
            except OSError:
                pass
    return output
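
All five examples call a format_sentence helper that is defined elsewhere in the module and not shown on this page. Below is a minimal sketch of what such a normalizer could look like, assuming it only lowercases the text and strips everything except the target alphabet, digits and sentence punctuation; it is hypothetical, and the real helper may differ. It takes an optional language argument because get_books below calls it with a second "en" argument.

import re

def format_sentence(text, lang="ru"):
    # Hypothetical normalizer; the actual helper is not shown in these examples.
    alphabet = "а-яё" if lang == "ru" else "a-z"
    text = text.lower()
    # Replace everything outside the alphabet, digits and ;!?. with spaces.
    text = re.sub("[^{}0-9;!?. ]".format(alphabet), " ", text)
    # Collapse runs of whitespace.
    return re.sub(r"\s+", " ", text).strip()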
Example #2
import xml.etree.ElementTree as ET


def xml_parse():
    output = []
    for i in range(2, 4070):
        try:
            tree = ET.parse('corpus/opcorpora_xml/{}.xml'.format(i))
            root = tree.getroot()
            # Element layout: text (name) -> paragraphs -> paragraph -> sentence -> source.
            for paragraph in root[1]:
                output.append(format_sentence(paragraph[0][0].text))
        except (FileNotFoundError, ET.ParseError, IndexError):
            # Skip gaps and malformed files in the numbered range.
            pass
    return output
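
The indexing root[1] and paragraph[0][0].text relies on the element layout named in the comment. A tiny self-contained illustration of that navigation follows; the XML here is an invented stub, not a real OpenCorpora file:

import xml.etree.ElementTree as ET

stub = """<text name="demo">
  <tags/>
  <paragraphs>
    <paragraph>
      <sentence>
        <source>Это первое предложение.</source>
      </sentence>
    </paragraph>
  </paragraphs>
</text>"""

root = ET.fromstring(stub)
for paragraph in root[1]:        # root[1] is <paragraphs>
    print(paragraph[0][0].text)  # paragraph -> sentence -> source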
Example #3
import os
import re


def books_parse():
    filenames = os.listdir(path="corpus/ru_books")
    output = []
    for file in filenames:
        try:
            # The books are stored in the cp1251 (Windows Cyrillic) encoding.
            with open('corpus/ru_books/' + file, 'r', encoding='cp1251') as input_file:
                book = input_file.read()
            book = format_sentence(book)
            # Split the normalized text into sentence-like fragments.
            output += re.split(r'; |!|\.|\?', book)
            print(file)
        except (OSError, UnicodeDecodeError):
            print("ERROR", file)
    return output
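
The pattern r'; |!|\.|\?' treats "; ", "!", "." and "?" as sentence boundaries, so the resulting fragments may keep leading spaces and an empty trailing element:

import re

re.split(r'; |!|\.|\?', "Первое предложение. Второе! Третье?")
# -> ['Первое предложение', ' Второе', ' Третье', '']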
Example #4
def twits_parse():
    # Extract tweet texts from an SQL dump of a corpus of positive tweets.
    output = []
    with open("corpus/positive.sql", encoding="utf-8") as f:
        for _ in range(47):
            f.readline()  # skip the header of the dump
        for _ in range(48, 72):  # the next 24 lines hold the INSERT statements
            line = f.readline()
            while True:
                # Move to the next row "(...)" and skip the first three columns;
                # the tweet text is the quoted fourth column.
                line = line[line.find("(") + 1:]
                line = skip_comma(line, 3)  # helper defined elsewhere in the module
                twit = line[:line.find("'")]
                twit = format_sentence(twit)
                if twit == "" or twit.find(", ) ;") != -1:
                    break
                output.append(twit)
    return output
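
skip_comma is another helper from the same module that is not shown here. Judging by how it is used above, it skips a given number of comma-separated SQL columns and leaves the line just past the opening quote of the text column. A hypothetical sketch, which would break on tweets containing escaped quotes:

def skip_comma(line, n):
    # Hypothetical helper: drop n comma-separated columns, then position
    # the line right after the opening quote of the next string column.
    for _ in range(n):
        line = line[line.find(",") + 1:]
    return line[line.find("'") + 1:]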
Example #5
import re

import requests
from lxml import html


def get_books(flag=False):
    # Scrape the Project Gutenberg top-100 page and pull the ebook number
    # out of each of the first 100 links (e.g. "/ebooks/1342" -> "1342").
    a = requests.get("http://www.gutenberg.org/browse/scores/top")
    tree = html.fromstring(a.content.decode())
    nums_for_url = []
    for link in tree.xpath('//li/a')[:100]:
        href = link.get("href")
        nums_for_url.append(href[href.find("/", 1) + 1:])
    urls = ["http://www.gutenberg.org/cache/epub/" + str(i) + "/pg" + str(i) + ".txt"
            for i in nums_for_url]
    n = 0
    output = []
    for url in urls:
        book = format_sentence(requests.get(url).content.decode(), "en")
        # Sentences are collected unconditionally; the flag only controls
        # whether each book is also written to its own file.
        output += re.split(r'; |!|\.|\?', book)
        if flag:
            print(n, end=" ")
            with open("corpus/en_books/book" + str(n) + ".txt", 'w', encoding='utf-8') as f:
                f.write(book)
            n += 1
    return output
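
A possible way to combine the five parsers into a single corpus, assuming they all live in one module next to format_sentence and skip_comma:

corpus = []
corpus += wiki_parse()    # Russian Wikipedia articles
corpus += xml_parse()     # OpenCorpora sentences
corpus += books_parse()   # Russian books (cp1251)
corpus += twits_parse()   # tweets from the SQL dump
corpus += get_books()     # top-100 Project Gutenberg books (English)
print(len(corpus), "fragments collected")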