import json
import operator
import os

import nltk
import numpy as np

import util


def gen_FeatureMatrix(news_file, price_file, stopWords_file, output, wordDict,
                      dim_wordVec, sentense_len, term_type, mtype):
    """Map each top-story headline to word vectors from wordDict, pad to
    sentense_len, and append the features plus the price label to output + mtype."""
    with open(price_file) as file:
        print("Loading price info ...")
        priceDt = json.load(file)[term_type]
    cnt = 0
    testDates = util.dateGenerator(300)
    os.system('rm ' + output + mtype)

    stopWords = set()
    with open(stopWords_file) as file:
        for word in file:
            stopWords.add(word.strip())

    with open(news_file) as f:
        for line in f:
            line = line.strip().split(',')
            if len(line) != 6:
                continue
            # newsType: [topStory, normal]
            ticker, name, day, headline, body, newsType = line
            if newsType != 'topStory':
                continue  # skip normal news
            if ticker not in priceDt:
                continue  # skip if no corresponding company found
            if day not in priceDt[ticker]:
                continue  # skip if no corresponding date found
            cnt += 1
            # if cnt > 20: continue
            if cnt % 1000 == 0:
                print("%sing samples %d" % (mtype, cnt))
            if mtype == "test" and day not in testDates:
                continue
            if mtype == "train" and day in testDates:
                continue

            # 2.1 tokenize the sentence, check if the word belongs to the top words,
            # and unify the format of words
            # headline = headline.encode('utf-8')
            # body = body.encode('utf-8')
            tokens = nltk.word_tokenize(headline)  # + nltk.word_tokenize(body)
            tokens = list(map(util.unify_word, tokens))

            # build feature and label
            feature = np.zeros([0, dim_wordVec])
            featureNone = True
            for t in tokens:
                # if t in stopWords: continue
                if t not in wordDict:
                    continue
                featureNone = False
                feature = np.vstack((feature, np.matrix(wordDict[t])))
            if featureNone:
                continue  # feature is empty, continue
            feature = util.padding(feature, sentense_len)
            label = round(priceDt[ticker][day], 6)
            with open(output + mtype, 'a+') as file:
                np.savetxt(file, np.hstack((feature, np.matrix(label))), fmt='%.5f')
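
# Illustrative usage sketch only (not part of the original pipeline): the pickle
# path, the price/stop-word/output paths, and the hyper-parameters below are
# assumptions. wordDict is assumed to map each word to a dim_wordVec-dimensional
# embedding, as the np.matrix(wordDict[t]) call above requires.
def demo_gen_FeatureMatrix():
    import pickle
    with open('./input/word2vec.pkl', 'rb') as fp:   # hypothetical embedding file
        wordDict = pickle.load(fp)
    for mtype in ('train', 'test'):                  # build both splits
        gen_FeatureMatrix(news_file='./input/news_reuters.csv',
                          price_file='./input/stockPrices.json',
                          stopWords_file='./input/stopWords',
                          output='./input/featureMatrix_',
                          wordDict=wordDict,
                          dim_wordVec=100,
                          sentense_len=30,
                          term_type='short',
                          mtype=mtype)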
def tokenize(news_file, price_file, stopWords_file, output, sentense_len,
             term_type, n_vocab, mtype):
    """Convert each top-story headline and body into a padded sequence of word
    indices, restrict the vocabulary to the n_vocab most frequent words (the rest
    map to UNKNOWN), and append each sequence plus its price label to output + mtype."""
    # load price data
    with open(price_file) as file:
        print("Loading price info ...")
        priceDt = json.load(file)[term_type]
    testDates = util.dateGenerator(1)  # the most recent days are used for testing
    os.system('rm ' + output + mtype)

    # load stop words
    stopWords = set()
    with open(stopWords_file) as file:
        for word in file:
            stopWords.add(word.strip())

    # build feature matrix
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    word_idx_count = {0: float('inf'), 1: float('inf')}
    sentences, labels = [], []
    with open(news_file) as f:
        for num, line in enumerate(f):
            line = line.strip().split(',')
            if len(line) != 6:
                continue
            ticker, name, day, headline, body, newsType = line
            if newsType != 'topStory':  # newsType: [topStory, normal]
                continue  # skip normal news
            if ticker not in priceDt:
                continue  # skip if no corresponding company found
            if day not in priceDt[ticker]:
                continue  # skip if no corresponding date found
            if num % 10000 == 0:
                print("%sing samples %d" % (mtype, num))
            if mtype == "test" and day not in testDates:
                continue
            if mtype == "train" and day in testDates:
                continue

            tokens = nltk.word_tokenize(headline) + nltk.word_tokenize(body)
            tokens = list(map(util.unify_word, tokens))
            for t in tokens:
                if t in stopWords:
                    continue
                if t not in word2idx:
                    word2idx[t] = current_idx
                    idx2word.append(t)
                    current_idx += 1
                idx = word2idx[t]
                word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
            sentence_by_idx = [word2idx[t] for t in tokens if t not in stopWords]
            sentences.append(sentence_by_idx)
            labels.append(round(priceDt[ticker][day], 6))

    # restrict vocab size
    sorted_word_idx_count = sorted(word_idx_count.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        print(word, count)
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx

    # map old idx to new idx
    features = []  # shorter sentence idx
    for num, sentence in enumerate(sentences):
        if len(sentence) > 1:
            new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown
                            for idx in sentence]
            # padding
            if len(new_sentence) > sentense_len:
                new_sentence = new_sentence[:sentense_len]
            else:
                new_sentence = new_sentence + [1] * (sentense_len - len(new_sentence))
            new_sentence.append(labels[num])
            features.append(new_sentence)
    features = np.matrix(features)
    print(features.shape)
    with open(output + mtype, 'a+') as file:
        np.savetxt(file, features, fmt="%s")
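
# Each row written by tokenize() above is `sentense_len` word indices followed by
# the price label in the last column. A minimal loading sketch under that layout
# (the default file path is an assumption):
def demo_load_token_matrix(path='./input/featureMatrix_train'):
    data = np.loadtxt(path)
    X = data[:, :-1].astype(int)   # padded word-index sequences
    y = data[:, -1]                # rounded price-change labels
    return X, y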
def tokenize(news_file, price_file, stopWords_file, output, output_wd2idx,
             sen_len, term_type, n_vocab, mtype):
    """Variant of tokenize() that reads raw open/close prices, uses the relative
    intraday price change as the label, and also dumps the reduced word2idx
    vocabulary to output_wd2idx. This later definition shadows the tokenize()
    defined above when the module is loaded."""
    # load price data
    with open(price_file) as file:
        print("Loading price info ... " + mtype)
        priceDt = json.load(file)
    testDates = util.dateGenerator(20)  # the most recent days are used for testing
    os.system('rm ' + output + mtype)

    # load stop words
    stopWords = set()
    with open(stopWords_file) as file:
        for word in file:
            stopWords.add(word.strip())

    # build feature matrix
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    word_idx_count = {0: float('inf'), 1: float('inf')}
    sentences, labels = [], []
    # os.system('cat ./input/news/*/* > ./input/news_reuters.csv')
    with open(news_file) as f:
        for num, line in enumerate(f):
            line = line.strip().split(',')
            if len(line) not in [6, 7]:
                continue
            if len(line) == 6:
                ticker, name, day, headline, body, newsType = line
            else:
                ticker, name, day, headline, body, newsType, suggestion = line
            if newsType != 'topStory':  # newsType: [topStory, normal]
                continue  # skip normal news
            if ticker not in priceDt:
                continue  # skip if no corresponding company found
            if not priceDt[ticker]:
                continue  # skip if corresponding company has price info as None
            if day not in priceDt[ticker]['open']:
                continue  # skip if no corresponding date found
            if num % 10000 == 0:
                print("%sing samples %d" % (mtype, num))
            if mtype == "test" and day.replace('-', '') not in testDates:
                continue
            if mtype == "train" and day.replace('-', '') in testDates:
                continue

            content = headline + ' ' + body
            content = content.replace("-", " ")
            tokens = util.tokenize_news(content, stopWords)
            for t in tokens:
                if t not in word2idx:
                    word2idx[t] = current_idx
                    idx2word.append(t)
                    current_idx += 1
                idx = word2idx[t]
                word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
            sentence_by_idx = [word2idx[t] for t in tokens if t not in stopWords]
            sentences.append(sentence_by_idx)

            price_dict_of_current_ticker = priceDt[ticker]
            open_price = price_dict_of_current_ticker['open'][day]
            close_price = price_dict_of_current_ticker['close'][day]
            change = (open_price - close_price) / open_price
            labels.append(round(change, 6))

    # restrict vocab size
    sorted_word_idx_count = sorted(word_idx_count.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    total_num, cdf = 0.0, 0.0
    for idx, count in sorted_word_idx_count[:n_vocab]:
        if count == float('inf'):  # skip the START/END placeholder counts
            continue
        total_num += count
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        if count == float('inf'):
            continue
        cdf += (count * 1.0 / (total_num * 1.0))
        # print(word, count, str(cdf)[:5])
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx

    # map old idx to new idx
    features = []  # shorter sentence idx
    for num, sentence in enumerate(sentences):
        if len(sentence) > 1:
            new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown
                            for idx in sentence]
            # padding
            if len(new_sentence) > sen_len:
                new_sentence = new_sentence[:sen_len]
            else:
                new_sentence = new_sentence + [1] * (sen_len - len(new_sentence))
            new_sentence.append(labels[num])
            features.append(new_sentence)
    features = np.matrix(features)
    print(features.shape)
    with open(output_wd2idx, 'w') as fp:
        json.dump(word2idx_small, fp)
    with open(output + mtype, 'a+') as file:
        np.savetxt(file, features, fmt="%s")
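
# A hedged end-to-end driver sketch for the second tokenize(): every path and
# hyper-parameter here is an assumption made for illustration, not the original
# project's configuration.
if __name__ == '__main__':
    for mtype in ('train', 'test'):
        tokenize(news_file='./input/news_reuters.csv',
                 price_file='./input/stockPrices_raw.json',
                 stopWords_file='./input/stopWords',
                 output='./input/featureMatrix_',
                 output_wd2idx='./input/word2idx.json',
                 sen_len=40,
                 term_type='short',
                 n_vocab=2000,
                 mtype=mtype)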