def write_h2h():
    """Write one head-to-head vector per team and date index under ``h2h/``.

    For every date index from 0 through ``len(dates)`` inclusive, and for
    every team of the East list followed by every team of the West list, a
    vector is filled with ``h2h(date_i, i, j, tag)`` for each other team
    ``j`` of the same list (in list order, skipping ``j == i``).  The vector
    has as many slots as the list has teams, so the last slot is never
    written and stays 0.  Each vector is stored with ``np.savetxt`` as
    ``h2h/<date_i>_<team>.dat``.
    """
    out_dir = 'h2h/'
    for date_i in range(len(dates) + 1):
        for teams, tag in ((EastTeams, "east"), (WestTeams, "west")):
            team_total = len(teams)
            for i, team in enumerate(teams):
                # Integer array, exactly as before.
                # NOTE(review): non-integer h2h() results would be truncated
                # on assignment - confirm h2h() only yields integers.
                row = np.array([0] * team_total)
                opponents = (j for j in range(team_total) if j != i)
                for slot, j in enumerate(opponents):
                    row[slot] = h2h(date_i, i, j, tag)
                np.savetxt(out_dir + str(date_i) + '_' + team + '.dat',
                           row, delimiter=' ')
        # NOTE(review): the total passed here is len(dates) - 1 although
        # date_i runs up to len(dates) - confirm the bar handles overshoot.
        hlp.printProgress(date_i, len(dates) - 1,
                          prefix='Progress:', barLength=50)
def _stock_date_params(date_text):
    """Split a ``'/'``-separated date into the query values of downloadStock.

    Returns ``(day, month, year, dotted)`` where ``day`` has its leading
    zeros stripped, ``month`` is the numeric month decremented by one (as a
    string), ``year`` is the third field unchanged and ``dotted`` is the
    three original fields joined with ``'.'``.
    """
    parts = date_text.split('/')
    day = parts[0].lstrip('0')
    # The query carries the month decremented by one.
    month = str(int(parts[1].lstrip('0')) - 1)
    year = parts[2]
    dotted = parts[0] + '.' + parts[1] + '.' + parts[2]
    return day, month, year, dotted


def downloadStock(company, date_from, date_to):
    """Download the quote rows of a supported company for a date range.

    Args:
        company: Company key; converted with ``str`` and must then be one of
            ``'sberbank'``, ``'gazprom'`` or ``'dm'``.
        date_from: Start of the range as three ``'/'``-separated fields
            (presumably ``dd/mm/yyyy`` - confirm with callers).
        date_to: End of the range, same format as ``date_from``.

    Returns:
        A tuple ``(stocks_dates, stocks, stocks_count)``: the first and the
        third comma-separated field of every data row, and the number of
        rows collected.

    Raises:
        ValueError: If ``company`` is not one of the supported keys.
            Previously this surfaced later as an ``UnboundLocalError``.
    """
    company = str(company)
    tickers = {
        'sberbank': ('SBER', '3'),
        'gazprom': ('GAZP', '16842'),
        'dm': ('DSKY', '473181'),
    }
    if company not in tickers:
        raise ValueError('unsupported company: {!r}'.format(company))
    code, em = tickers[company]

    df, mf, yf, datef = _stock_date_params(date_from)
    dt, mt, yt, datet = _stock_date_params(date_to)
    cn = company

    print('Downloading stocks...')
    sys.stdout.flush()

    # NOTE(review): the template visible here starts directly with the first
    # placeholder (no scheme/host) - confirm the intended base URL.
    url = (
        '{}&code={}&apply=0&df={}&mf={}&yf={}&from={}'
        '&dt={}&mt={}&yt={}&to={}&p=8&f=stock_1&e=.txt&cn={}'
        '&dtf=4&tmf=3&MSOR=1&mstime=on&mstimever=1'
        '&sep=1&sep2=1&datf=5&at=1'
    ).format(em, code, df, mf, yf, datef, dt, mt, yt, datet, cn)

    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        data = response.read().decode("utf-8").split('\r\n')

    stocks_dates = []
    stocks = []
    stocks_count = 0
    progress_total = len(data) - 1
    # The first line and the last element are skipped, as before
    # (presumably a header row and the empty string after the final CRLF).
    for row in data[1:-1]:
        printProgress(stocks_count, progress_total)
        fields = row.split(',')
        stocks_dates.append(fields[0])
        stocks.append(fields[2])
        stocks_count += 1
    printProgress(stocks_count, stocks_count, True)
    print('Done!')
    sys.stdout.flush()
    return stocks_dates, stocks, stocks_count
def stem(news_dates, news, news_count):
    """Merge the news texts of each date into one string of stemmed words.

    Args:
        news_dates: Date of every news item, parallel to ``news``.
        news: News texts.
        news_count: Number of leading items of ``news`` to consider.

    Returns:
        ``(stems_dates, stems, stems_count)``: the distinct dates in order of
        first appearance, one space-joined string of stems per date, and the
        number of distinct dates.

    Items are consumed with a single forward cursor: each date only collects
    the run of consecutive items found at the cursor position, so items of a
    date that shows up again later are not revisited.
    """
    stems_dates = []
    for date in news_dates:
        if date not in stems_dates:
            stems_dates.append(date)
    stems_count = len(stems_dates)
    stems = []

    print('Stemming news...')
    sys.stdout.flush()

    # Passed as the tokenizer's ``filters`` argument: punctuation, dashes,
    # digits and the lowercase Latin alphabet (presumably the characters the
    # tokenizer strips - confirm against its documentation).
    drop_chars = ''.join(punctuation) + '–—01234567890abcdefghijklmnopqrstuvwxyz'

    cursor = 0
    for day_no, day in enumerate(stems_dates):
        printProgress(day_no, stems_count)
        day_stems = []
        while cursor < news_count and day == news_dates[cursor]:
            for word in text_to_word_sequence(news[cursor], filters=drop_chars):
                if word in stemmer.stopwords or word == ' ':
                    continue
                day_stems.append(stemmer.stem(word))
            cursor += 1
        stems.append(' '.join(day_stems))

    printProgress(stems_count, stems_count, True)
    print('Done!')
    sys.stdout.flush()
    return stems_dates, stems, stems_count
def write_gamesleft():
    """Write one games-left vector per team and date index under ``gamesleft/``.

    Date indices run from 161 through ``len(dates)`` inclusive.  For every
    East team and then every West team, a vector with
    ``len(EastTeams) + len(WestTeams)`` slots is filled with
    ``gamesleft(date_i, i, own_tag, j, tag)``: first against the other teams
    of the team's own list (skipping ``j == i``), then against every team of
    the other list.  That is one value fewer than there are slots, so the
    last slot stays 0.  Each vector is stored with ``np.savetxt`` as
    ``gamesleft/<date_i>_<team>.dat``.
    """
    out_dir = 'gamesleft/'
    pairings = (
        (EastTeams, "east", WestTeams, "west"),
        (WestTeams, "west", EastTeams, "east"),
    )
    for date_i in range(161, len(dates) + 1):
        for own, own_tag, other, other_tag in pairings:
            for i, team in enumerate(own):
                # Integer array, exactly as before.
                # NOTE(review): non-integer gamesleft() results would be
                # truncated on assignment - confirm it only yields integers.
                row = np.array([0] * (len(EastTeams) + len(WestTeams)))
                opponents = [(j, own_tag) for j in range(len(own)) if j != i]
                opponents += [(j, other_tag) for j in range(len(other))]
                for slot, (j, tag) in enumerate(opponents):
                    row[slot] = gamesleft(date_i, i, own_tag, j, tag)
                np.savetxt(out_dir + str(date_i) + '_' + team + '.dat',
                           row, delimiter=' ')
        # NOTE(review): the total passed here is len(dates) - 1 although
        # date_i runs up to len(dates) - confirm the bar handles overshoot.
        hlp.printProgress(date_i, len(dates) - 1,
                          prefix='Progress:', barLength=50)
def _news_item_date(label):
    """Convert the date label of a news row into a ``dd/mm/yy`` string."""
    if label == 'сегодня':  # "today"
        return datetime.datetime.now().strftime('%d/%m/%y')
    if label == 'вчера':  # "yesterday"
        yesterday = datetime.datetime.now() - datetime.timedelta(1)
        return yesterday.strftime('%d/%m/%y')
    # Otherwise a dotted date; only the year's characters from index 2 on
    # are kept (two digits for a four-digit year).
    parts = label.split('.')
    return '{}/{}/{}'.format(str(parts[0]), str(parts[1]), str(parts[2][2:]))


def downloadNews(company, amount):
    """Download news articles for a company, oldest entry first.

    Args:
        company: ``'sberbank'`` is mapped to ``'1'`` and ``'gazprom'`` to
            ``'3'``; any other value is handed to ``getTrs`` unchanged.
        amount: Converted with ``int`` and handed to ``getTrs``.

    Returns:
        ``(news_dates, news, news_count)``: the article dates as
        ``dd/mm/yy`` strings and the article texts, both in reverse order of
        the rows returned by ``getTrs``, and the number of kept articles.

    An article is kept only when its text is non-empty and its
    ``mfd-related-companies`` block holds exactly one link.  A row that
    fails is retried after ``delay_except`` seconds; after ``max_attempts``
    consecutive failures it is skipped, so a permanently broken row can no
    longer stall the download forever.
    """
    # NOTE(review): empty in this view, so row hrefs are used as-is -
    # confirm whether a site prefix is expected here.
    domain = ''
    news_dates = []
    news = []
    news_count = 0
    if company == 'sberbank':
        company = '1'
    elif company == 'gazprom':
        company = '3'
    amount = int(amount)
    sys.stdout.flush()
    trs = getTrs(company, amount)
    total = len(trs)
    current = 0
    failures = 0
    max_attempts = 5
    print('Downloading news...')
    while current < total:
        try:
            printProgress(current, total)
            td = trs[current].findAll('td')
            item_date = _news_item_date(
                td[0].getText().split(',')[0].strip())
            item_url = domain + td[1].find('a').get('href')
            # Parse inside the ``with`` so the response is closed afterwards.
            with urlopen(item_url) as item_page:
                item_bs = BeautifulSoup(item_page, 'html.parser')
            item_single = item_bs.find(
                'div', {'class': 'mfd-related-companies'}).findAll('a')
            item_data = item_bs.find(
                'div', {'class': 'm-content'}).findAll('p')
            # The first and the last two paragraphs are left out, as before.
            item_string = ' '.join(
                paragraph.getText() for paragraph in item_data[1:-2]).strip()
        except Exception:
            # ``Exception`` instead of a bare ``except`` keeps Ctrl-C and
            # interpreter exit working.
            failures += 1
            if failures >= max_attempts:
                print('Skipping news item {} after {} failed attempts'.format(
                    current, max_attempts), file=sys.stderr)
                current += 1
                failures = 0
            time.sleep(delay_except)
        else:
            if item_string != '' and len(item_single) == 1:
                news_dates.append(item_date)
                news.append(item_string)
                news_count += 1
            current += 1
            failures = 0
            time.sleep(delay)
    printProgress(total, total, True)
    print('Done!')
    sys.stdout.flush()
    return news_dates[::-1], news[::-1], news_count