import codecs
import os
import re
import urllib.request
import xml.etree.ElementTree as ET

import bs4 as bs
import requests
from lxml import html


def wiki_parse(flag=False):
    # Scrape the Russian Wikipedia page "List of articles every Wikipedia
    # should have" and collect the text of every linked article.
    page = urllib.request.urlopen("https://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F:%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%81%D1%82%D0%B0%D1%82%D0%B5%D0%B9,_%D0%BA%D0%BE%D1%82%D0%BE%D1%80%D1%8B%D0%B5_%D0%B4%D0%BE%D0%BB%D0%B6%D0%BD%D1%8B_%D0%B1%D1%8B%D1%82%D1%8C_%D0%B2%D0%BE_%D0%B2%D1%81%D0%B5%D1%85_%D1%8F%D0%B7%D1%8B%D0%BA%D0%BE%D0%B2%D1%8B%D1%85_%D0%B2%D0%B5%D1%80%D1%81%D0%B8%D1%8F%D1%85")
    soup = bs.BeautifulSoup(page.read(), 'lxml')
    urls = []
    for link in soup.find_all('a'):
        if link.text != "":
            urls.append((link.text, link.get('href')))
    urls = urls[73:-245]  # trim the navigation links before and after the article list
    output = []
    for i in range(0, 998):
        # hrefs are site-relative ("/wiki/..."), so join them to the bare host
        scraped_data = urllib.request.urlopen('https://ru.wikipedia.org' + urls[i][1])
        parsed_article = bs.BeautifulSoup(scraped_data.read(), 'lxml')
        print(i, end=" ")
        # Concatenate all paragraph texts into one article string.
        article_text = ""
        for p in parsed_article.find_all('p'):
            article_text += p.text
        processed_article = format_sentence(article_text)
        output.append(processed_article)
        if flag:
            # Optionally dump each article to disk, title on the first line.
            try:
                with open("corpus/wikis/wiki" + str(i) + ".txt", "w+", encoding='utf-8') as f:
                    f.write(urls[i][0] + '\n' + processed_article)
            except OSError:
                pass
    return output
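# `format_sentence` is used by every parser in this section but is defined
# elsewhere in the project. A minimal sketch of its assumed behavior
# (lowercase, drop out-of-alphabet characters, collapse whitespace); the real
# helper may differ:
def format_sentence(text, lang="ru"):
    # keep letters of the target alphabet plus digits and basic punctuation
    letters = "а-яё" if lang == "ru" else "a-z"
    text = text.lower()
    text = re.sub(r"[^{}0-9.!?;,() ]".format(letters), " ", text)
    # collapse the runs of whitespace left behind by the substitution
    return re.sub(r"\s+", " ", text).strip()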
def xml_parse():
    # Parse the OpenCorpora XML files and pull out the source text of the
    # first sentence of every paragraph.
    output = []
    for i in range(2, 4070):
        try:
            tree = ET.parse('corpus/opcorpora_xml/{}.xml'.format(i))
            root = tree.getroot()
            # text (name) -> paragraphs -> paragraph -> sentence -> source
            for paragraph in root[1]:
                output.append(format_sentence(paragraph[0][0].text))
        except (ET.ParseError, FileNotFoundError, IndexError, AttributeError):
            pass  # skip missing or malformed files
    return output
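# The indexing in xml_parse assumes the OpenCorpora per-text export layout,
# roughly (illustrative sketch, following the path in the comment above):
#   <text>
#     <tags>...</tags>          <- root[0]
#     <paragraphs>              <- root[1]
#       <paragraph>
#         <sentence>            <- paragraph[0] (only the first sentence is taken)
#           <source>plain sentence text</source>   <- paragraph[0][0]
# so paragraph[0][0].text is the raw text of each paragraph's first sentence.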
def books_parse():
    # Read every Russian book in corpus/ru_books (cp1251-encoded) and split
    # each one into sentences.
    filenames = os.listdir(path="corpus/ru_books")
    output = []
    for file in filenames:
        try:
            with codecs.open('corpus/ru_books/' + file, 'r', encoding='cp1251') as input_file:
                book = input_file.read()
            book = format_sentence(book)
            output += re.split(r'; |!|\.|\?', book)
            print(file)
        except (OSError, UnicodeDecodeError):
            print("ERROR", file)
    return output
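# Quick illustration of the sentence splitter used above:
#   re.split(r'; |!|\.|\?', "привет. как дела? хорошо!")
#   -> ['привет', ' как дела', ' хорошо', '']
# Note that empty strings like the trailing one survive the split, so
# downstream code is assumed to filter them out.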
def twits_parse():
    # Extract tweet texts from the positive.sql dump: skip the dump header,
    # then walk the INSERT statements row by row.
    output = []
    with open("corpus/positive.sql", encoding="utf-8") as f:
        for i in range(47):
            f.readline()  # skip the SQL dump header
        for i in range(48, 72):
            line = f.readline()  # one INSERT statement holding many rows
            while True:
                line = line[line.find("(") + 1:]  # jump to the next row
                line = skip_comma(line, 3)        # skip the id/date/name fields
                index = line.find("'")
                twit = format_sentence(line[:index])
                if twit == "" or twit.find(", ) ;") != -1:
                    break  # end of this INSERT statement
                output.append(twit)
    return output
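# `skip_comma` is another helper defined outside this section. Judging by its
# use above, it is assumed to drop the first n comma-separated fields and land
# just inside the next quoted field, so that find("'") then hits the closing
# quote of the tweet text. A minimal sketch under that assumption:
def skip_comma(line, n):
    for _ in range(n):
        line = line[line.find(",") + 1:]  # drop one comma-separated field
    return line[line.find("'") + 1:]      # step past the opening quote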
def get_books(flag=False):
    # Scrape the Project Gutenberg "Top 100" page, build the plain-text URLs
    # from the book ids, and split each book into sentences.
    page = requests.get("http://www.gutenberg.org/browse/scores/top")
    tree = html.fromstring(page.content.decode())
    links = tree.xpath('//li/a')
    nums_for_url = []
    for ind in range(0, 100):
        href = links[ind].get("href")
        nums_for_url.append(href[href.find("/", 1) + 1:])  # keep only the numeric id
    urls = ["http://www.gutenberg.org/cache/epub/" + str(i) + "/pg" + str(i) + ".txt"
            for i in nums_for_url]
    output = []
    for n, url in enumerate(urls):
        book = format_sentence(requests.get(url).content.decode(), "en")
        print(n, end=" ")
        if flag:
            # Optionally dump each book to disk.
            with open("corpus/en_books/book" + str(n) + ".txt", 'w', encoding='utf-8') as f:
                f.write(book)
        output += re.split(r'; |!|\.|\?', book)
    return output
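# One possible driver assembling the full corpus from all five sources
# (a hypothetical usage sketch, not part of the original pipeline):
if __name__ == "__main__":
    corpus = []
    corpus += wiki_parse(flag=True)   # Russian Wikipedia articles
    corpus += xml_parse()             # OpenCorpora sentences
    corpus += books_parse()           # Russian books (cp1251)
    corpus += twits_parse()           # positive tweets from the SQL dump
    corpus += get_books(flag=True)    # Project Gutenberg top-100 English books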