Example #1
import json
import operator
import os

import nltk
import numpy as np

import util  # project helper module (dateGenerator, unify_word, padding, tokenize_news are used below)


def gen_FeatureMatrix(news_file, price_file, stopWords_file, output, wordDict,
                      dim_wordVec, sentense_len, term_type, mtype):
    with open(price_file) as file:
        print("Loading price info ...")
        priceDt = json.load(file)[term_type]
    cnt = 0
    testDates = util.dateGenerator(300)
    os.system('rm ' + output + mtype)  # clear any previous output file for this split

    stopWords = set()
    with open(stopWords_file) as file:
        for word in file:
            stopWords.add(word.strip())

    with open(news_file) as f:
        for line in f:
            line = line.strip().split(',')
            if len(line) != 6: continue
            '''
            newsType: [topStory, normal]
            '''
            ticker, name, day, headline, body, newsType = line
            if newsType != 'topStory': continue  # skip normal news
            if ticker not in priceDt:
                continue  # skip if no corresponding company found
            if day not in priceDt[ticker]:
                continue  # skip if no corresponding date found
            cnt += 1
            # if cnt > 20: continue
            if cnt % 1000 == 0: print("%sing samples %d" % (mtype, cnt))
            if mtype == "test" and day not in testDates: continue
            if mtype == "train" and day in testDates: continue
            # 2.1 tokenize the sentence, check whether each word is in wordDict, and unify the word format
            #headline = headline.encode('utf-8')
            #body = body.encode('utf-8')

            tokens = nltk.word_tokenize(headline)  # + nltk.word_tokenize(body)
            tokens = list(map(util.unify_word, tokens))

            # build feature and label
            feature = np.zeros([0, dim_wordVec])
            featureNone = True
            for t in tokens:
                # if t in stopWords: continue
                if t not in wordDict: continue
                featureNone = False
                feature = np.vstack((feature, np.matrix(wordDict[t])))
            if featureNone: continue  # feature is empty, continue

            feature = util.padding(feature, sentense_len)
            label = round(priceDt[ticker][day], 6)

            with open(output + mtype, 'a+') as file:
                np.savetxt(file,
                           np.hstack((feature, np.matrix(label))),
                           fmt='%.5f')
# First tokenize variant: builds index-based features from headline + body using a
# price dict keyed by term_type, and writes rows of word indices followed by the label.
def tokenize(news_file, price_file, stopWords_file, output, sentense_len,
             term_type, n_vocab, mtype):
    # load price data
    with open(price_file) as file:
        print("Loading price info ...")
        priceDt = json.load(file)[term_type]

    testDates = util.dateGenerator(1)  # the most recent days are used for testing
    os.system('rm ' + output + mtype)

    # load stop words
    stopWords = set()
    with open(stopWords_file) as file:
        for word in file:
            stopWords.add(word.strip())

    # build feature matrix
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    word_idx_count = {0: float('inf'), 1: float('inf')}
    sentences, labels = [], []
    with open(news_file) as f:
        for num, line in enumerate(f):
            line = line.strip().split(',')
            if len(line) != 6:
                continue
            ticker, name, day, headline, body, newsType = line

            if newsType != 'topStory':  # newsType: [topStory, normal]
                continue  # skip normal news

            if ticker not in priceDt:
                continue  # skip if no corresponding company found
            if day not in priceDt[ticker]:
                continue  # skip if no corresponding date found

            if num % 10000 == 0:
                print("%sing samples %d" % (mtype, num))
            if mtype == "test" and day not in testDates:
                continue
            if mtype == "train" and day in testDates:
                continue

            tokens = nltk.word_tokenize(headline) + nltk.word_tokenize(body)
            tokens = list(map(util.unify_word, tokens))

            for t in tokens:
                if t in stopWords:
                    continue
                if t not in word2idx:
                    word2idx[t] = current_idx
                    idx2word.append(t)
                    current_idx += 1
                idx = word2idx[t]
                word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
            sentence_by_idx = [
                word2idx[t] for t in tokens if t not in stopWords
            ]
            sentences.append(sentence_by_idx)
            labels.append(round(priceDt[ticker][day], 6))

    # restrict vocab size
    sorted_word_idx_count = sorted(word_idx_count.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        print(word, count)
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx

    # map old idx to new idx
    features = []  # shorter sentence idx
    for num, sentence in enumerate(sentences):
        if len(sentence) > 1:
            new_sentence = [
                idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown
                for idx in sentence
            ]
            # padding
            if len(new_sentence) > sentense_len:
                new_sentence = new_sentence[:sentense_len]
            else:
                new_sentence = new_sentence + [1] * (sentense_len -
                                                     len(new_sentence))
            new_sentence.append(labels[num])
            features.append(new_sentence)

    features = np.matrix(features)
    print(features.shape)

    with open(output + mtype, 'a+') as file:
        np.savetxt(file, features, fmt="%s")
# Second tokenize variant: expects per-ticker 'open'/'close' price dicts, labels each
# sample with the day's open-to-close change, and also saves the reduced word2idx
# vocabulary as JSON (output_wd2idx).
def tokenize(news_file, price_file, stopWords_file, output, output_wd2idx,
             sen_len, term_type, n_vocab, mtype):
    # load price data
    with open(price_file) as file:
        print("Loading price info ... " + mtype)
        priceDt = json.load(file)  # whole price JSON; term_type is not applied in this variant

    testDates = util.dateGenerator(20)  # the most recent days are used for testing
    os.system('rm ' + output + mtype)

    # load stop words
    stopWords = set()
    with open(stopWords_file) as file:
        for word in file:
            stopWords.add(word.strip())

    # build feature matrix
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    word_idx_count = {0: float('inf'), 1: float('inf')}
    sentences, labels = [], []
    # os.system('cat ./input/news/*/* > ./input/news_reuters.csv')
    with open(news_file) as f:
        for num, line in enumerate(f):
            line = line.strip().split(',')
            if len(line) not in [6, 7]:
                continue
            if len(line) == 6:
                ticker, name, day, headline, body, newsType = line
            else:
                ticker, name, day, headline, body, newsType, suggestion = line
            if newsType != 'topStory':  # newsType: [topStory, normal]
                continue  # skip normal news

            if ticker not in priceDt:
                continue  # skip if no corresponding company found
            if not priceDt[ticker]:
                continue  # skip if corresponding company has price info as None
            if day not in priceDt[ticker]['open']:
                continue  # skip if no corresponding date found

            if num % 10000 == 0:
                print("%sing samples %d" % (mtype, num))
            if mtype == "test" and day.replace('-', '') not in testDates:
                continue
            if mtype == "train" and day.replace('-', '') in testDates:
                continue
            content = headline + ' ' + body
            content = content.replace("-", " ")
            tokens = util.tokenize_news(content, stopWords)

            for t in tokens:
                if t not in word2idx:
                    word2idx[t] = current_idx
                    idx2word.append(t)
                    current_idx += 1
                idx = word2idx[t]
                word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
            sentence_by_idx = [
                word2idx[t] for t in tokens if t not in stopWords
            ]
            sentences.append(sentence_by_idx)
            # label: the day's open-to-close price change, relative to the open
            price_dict_of_current_ticker = priceDt[ticker]
            open_price = price_dict_of_current_ticker['open'][day]
            close_price = price_dict_of_current_ticker['close'][day]
            change = (open_price - close_price) / open_price
            labels.append(round(change, 6))

    # restrict vocab size
    sorted_word_idx_count = sorted(word_idx_count.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    total_num, cdf = 0.0, 0.0

    for idx, count in sorted_word_idx_count[:n_vocab]:
        if count == float('inf'):  # skip the START/END sentinels
            continue
        total_num += count
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        if count == float('inf'):  # skip the START/END sentinels
            continue
        cdf += count / total_num  # cumulative fraction of the corpus covered
        # print(word, count, str(cdf)[:5])
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1

    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx

    # map old idx to new idx
    features = []  # shorter sentence idx
    for num, sentence in enumerate(sentences):
        if len(sentence) > 1:
            new_sentence = [
                idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown
                for idx in sentence
            ]
            # padding
            if len(new_sentence) > sen_len:
                new_sentence = new_sentence[:sen_len]
            else:
                new_sentence = new_sentence + [1] * (sen_len -
                                                     len(new_sentence))
            new_sentence.append(labels[num])
            features.append(new_sentence)

    features = np.matrix(features)
    print(features.shape)

    with open(output_wd2idx, 'w') as fp:
        json.dump(word2idx_small, fp)

    with open(output + mtype, 'a+') as file:
        np.savetxt(file, features, fmt="%s")