示例#1
0
class Dataset(object):
    """Combined collection of news and tweets with a shared tf-idf matrix.

    Texts and documents are always ordered news first, tweets second
    (see ``get_texts`` / ``get_documents``).
    """

    def __init__(self, news_path=defaults.NEWS_PATH,
                       tweets_path=defaults.TWEETS_PATH,
                       resolve_url_map_path=defaults.RESOLVE_URL_MAP_PATH,
                       fraction=defaults.DATASET_FRACTION,
                       init_by_prepared_tweets=None,
                       percent_of_unique_words=0.0):
        """Load news and tweets, filter the tweets and build the tf-idf matrix.

        The dataset type is 'auto' when ``init_by_prepared_tweets`` is None
        and 'manual' otherwise. Only an 'auto' dataset resolves tweet urls
        and filters the tweets against the news.

        Args:
            news_path: currently not used by this constructor.
            tweets_path: forwarded to ``TweetsStorage``.
            resolve_url_map_path: forwarded to ``UrlResolver`` ('auto' only).
            fraction: forwarded to ``TweetsStorage``.
            init_by_prepared_tweets: forwarded to ``TweetsStorage``; also
                selects the dataset type.
            percent_of_unique_words: when greater than 0.0, passed to
                ``TweetsStorage.filter_not_unique_tweets`` together with
                the news.
        """
        is_automatic = init_by_prepared_tweets is None
        self.type = 'auto' if is_automatic else 'manual'

        # NOTE(review): news come from a pre-dumped storage rather than
        # NewsStorage(news_path), so news_path is ignored -- confirm intent.
        self.news = load('cutted_news_storage')
        self.tweets = TweetsStorage(tweets_path, fraction, init_by_prepared_tweets)
        self.text_to_text_links = None
        self.percent_of_unique_words = percent_of_unique_words

        if is_automatic:
            self.tweets.resolve_urls(UrlResolver(resolve_url_map_path))
            self.tweets.filter(self.news)

        if percent_of_unique_words > 0.0:
            self.tweets.filter_not_unique_tweets(self.news, percent_of_unique_words)

        self.lemmatized_texts = lemmatize_texts(self.get_texts())
        tf_idf_result = build_tf_idf_matrix(self.lemmatized_texts)
        self.corpus, self.tf_idf_matrix = tf_idf_result

        dataset_name = self.name()
        tweets_count = self.tweets.length()
        news_count = self.news.length()
        message = 'Dataset {NAME} builded and consist of {NUM_TWEETS} tweets and {NUM_NEWS} news'
        logging.info(message.format(NAME=dataset_name,
                                    NUM_TWEETS=tweets_count,
                                    NUM_NEWS=news_count))

    def get_texts(self):
        """Return the news texts followed by the tweet texts."""
        news_texts = self.news.get_texts()
        tweet_texts = self.tweets.get_texts()
        return news_texts + tweet_texts

    def get_documents(self):
        """Return the news documents followed by the tweet documents."""
        news_documents = self.news.get_documents()
        tweet_documents = self.tweets.get_documents()
        return news_documents + tweet_documents

    def init_text_to_text_links(self):
        """Number all documents and compute ``self.text_to_text_links``.

        News documents get ``index`` values starting at 0 and tweets
        continue the same numbering. Each tweet also receives a ``words``
        attribute holding the lemmas of its text.
        """
        logging.info('Finding text to text links for {NAME}'.format(NAME=self.name()))
        lemmatizer = Lemmatizer()

        # Stays at -1 when there are no news, so tweets then start from 0.
        last_news_index = -1
        for last_news_index, news_item in enumerate(self.news.get_documents()):
            news_item.index = last_news_index

        for position, tweet in enumerate(self.tweets.get_documents(), last_news_index + 1):
            tweet.words = lemmatizer.split_text_to_lemmas(tweet.text)
            tweet.index = position

        # Similarity of every document against every document.
        left_documents = self.get_documents()
        right_documents = self.get_documents()
        similarity_matrix = get_similarity_matrix(left_documents, right_documents,
                                                  self.corpus, self.tf_idf_matrix)

        self.text_to_text_links = get_text_to_text_relation(self.news.get_documents(),
                                                            self.tweets.get_documents(),
                                                            similarity_matrix)

    def name(self):
        """Return the dataset name built from its type and unique-words percent."""
        template = 'dataset_{TYPE}_{UNIQUE_PERCENT}'
        return template.format(TYPE=self.type, UNIQUE_PERCENT=self.percent_of_unique_words)
示例#2
0
文件: main.py 项目: art-vybor/twnews
def main():
    """Parse the command line and run the selected sub-command.

    Dispatch is driven by ``args.subparser``. Commands whose name merely
    contains 'tfidf' or 'recommend' are routed to the matching handler,
    which validates the exact name itself.

    Raises:
        Exception: when a train/apply run selects neither ``args.wtmf`` nor
            ``args.wtmf_g``, or when a tfidf/recommend command name is not
            one of the supported variants.
    """
    args = parse_args()

    logging.info('--------------- Twnews started ------------------')

    command = args.subparser
    if command == 'tweets_sample':
        _run_tweets_sample(args)
    elif command == 'resolver':
        _run_resolver(args)
    elif command == 'build_dataset':
        _run_build_dataset(args)
    elif command == 'train':
        _run_train(args)
    elif command == 'apply':
        _run_apply(args)
    elif 'tfidf' in command:
        _run_tfidf(args)
    # Must be tested before the substring check below, because
    # 'build_recommendation' also contains 'recommend'.
    elif command == 'build_recommendation':
        _run_build_recommendation(args)
    elif 'recommend' in command:
        _run_recommend(args)


def _select_model_class(args):
    """Return WTMF or WTMF_G according to ``args.wtmf`` / ``args.wtmf_g``.

    Raises:
        Exception: if neither flag is set. Previously this case surfaced
            later as an unbound ``model`` variable.
    """
    if args.wtmf:
        return WTMF
    if args.wtmf_g:
        return WTMF_G
    raise Exception('unexpected model type: expected wtmf or wtmf_g')


def _lemmatized_document_texts(documents):
    """Return the lemmatized texts of ``documents`` (via ``get_text()``)."""
    return lemmatize_texts([document.get_text() for document in documents])


def _run_tweets_sample(args):
    """Dump a sample of ``args.length`` tweets to ``args.tweets``."""
    log_and_print(logging.INFO, 'get sample of random tweets')
    tweets_filepath = args.tweets
    tweets_dirname, tweets_filename = split_filepath(tweets_filepath)

    storage = TweetsStorage(defaults.TWEETS_PATH, num_of_documents=args.length, sorted_keys=False)
    dump(storage.get_documents(), tweets_filename, tweets_dirname)

    log_and_print(logging.INFO, 'sample generated and saved at {PATH}'.format(PATH=tweets_filepath))


def _run_resolver(args):
    """Resolve all urls or print stats of resolved urls, depending on flags."""
    log_and_print(logging.INFO, 'url resolver')

    if args.resolve:
        log_and_print(logging.INFO, 'resolve all urls')
        resolve(sample_size=None)
        log_and_print(logging.INFO, 'all urls resolved')
    elif args.analyze:
        log_and_print(logging.INFO, 'stats of resolved urls')
        url_analyse()


def _run_build_dataset(args):
    """Build an automatic dataset with text-to-text links and dump it."""
    log_and_print(logging.INFO, 'building automatic dataset')
    dataset_filepath = args.dataset
    dataset_dirname, dataset_filename = split_filepath(dataset_filepath)

    dataset = Dataset(fraction=1, percent_of_unique_words=args.unique_words)
    dataset.init_text_to_text_links()

    dump(dataset, dataset_filename, dataset_dirname)

    log_and_print(logging.INFO, 'dataset builded and saved at {PATH}'.format(PATH=dataset_filepath))


def _run_train(args):
    """Train the selected model on a dumped dataset and dump the applied dataset."""
    log_and_print(logging.INFO, 'train model')
    dataset_dirname, dataset_filename = split_filepath(args.dataset)
    dataset_applied_filepath = args.dataset_applied
    dataset_applied_dirname, dataset_applied_filename = split_filepath(dataset_applied_filepath)
    model_dirname = args.model_dir

    # Validate the model choice before loading the dataset.
    model_class = _select_model_class(args)

    dataset = load(dataset_filename, dataset_dirname)
    model = model_class(dataset, dirname=model_dirname)
    model.build()

    dump(model.dataset_applied, dataset_applied_filename, dataset_applied_dirname)
    log_and_print(logging.INFO, 'applied dataset saved: {PATH}'.format(PATH=dataset_applied_filepath))


def _run_apply(args):
    """Apply a stored model to dumped tweets and dump the tweets with compare vectors."""
    log_and_print(logging.INFO, 'apply model')
    model_dirname, model_filename = split_filepath(args.model)
    tweets_dirname, tweets_filename = split_filepath(args.tweets)
    tweets_applied_filepath = args.tweets_applied
    tweets_applied_dirname, tweets_applied_filename = split_filepath(tweets_applied_filepath)

    # Validate the model choice before constructing the model or loading tweets.
    model_class = _select_model_class(args)
    model = model_class(model_name=model_filename, dirname=model_dirname)

    tweets = load(tweets_filename, tweets_dirname)

    # The tf-idf matrix is built over the model's own vocabulary.
    corpus = model.words

    texts = _lemmatized_document_texts(tweets)
    _, tf_idf_matrix = build_tf_idf_matrix(texts, vocabulary=corpus)

    model.texts = texts
    model.tf_idf_matrix = tf_idf_matrix
    result_matrix = model.apply()

    set_compare_vector(tweets, result_matrix)
    dump(tweets, tweets_applied_filename, tweets_applied_dirname)

    log_and_print(logging.INFO, 'tweets applied and stored at {PATH}'.format(PATH=tweets_applied_filepath))


def _run_tfidf(args):
    """Attach tf-idf compare vectors to a dataset and, for 'tfidf_tweets', to tweets.

    The dataset part runs for every tfidf command; the command name is
    only validated afterwards, matching the original control flow.

    Raises:
        Exception: if the command is neither 'tfidf_dataset' nor 'tfidf_tweets'.
    """
    log_and_print(logging.INFO, 'apply tfidf to dataset')
    dataset_dirname, dataset_filename = split_filepath(args.dataset)
    dataset_applied_filepath = args.dataset_applied
    dataset_applied_dirname, dataset_applied_filename = split_filepath(dataset_applied_filepath)

    dataset = load(dataset_filename, dataset_dirname)

    news_num = dataset.news.length()
    documents = dataset.get_documents()

    set_compare_vector(documents, dataset.tf_idf_matrix)
    # The first news_num documents are stored as news, the rest as tweets.
    news, tweets = documents[:news_num], documents[news_num:]
    dump((news, tweets), dataset_applied_filename, dataset_applied_dirname)

    log_and_print(logging.INFO, 'dataset applied and stored at {PATH}'.format(
        PATH=dataset_applied_filepath))

    if args.subparser == 'tfidf_dataset':
        return
    if args.subparser != 'tfidf_tweets':
        raise Exception('unexpected tfidf parser')

    log_and_print(logging.INFO, 'apply tfidf to tweets')
    tweets_dirname, tweets_filename = split_filepath(args.tweets)
    tweets_applied_filepath = args.tweets_applied
    tweets_applied_dirname, tweets_applied_filename = split_filepath(tweets_applied_filepath)

    tweets = load(tweets_filename, tweets_dirname)

    texts = _lemmatized_document_texts(tweets)
    _, result_matrix = build_tf_idf_matrix(texts, vocabulary=dataset.corpus)
    set_compare_vector(tweets, result_matrix)

    dump(tweets, tweets_applied_filename, tweets_applied_dirname)
    log_and_print(logging.INFO, 'tweets applied and stored at {PATH}'.format(
        PATH=tweets_applied_filepath))


def _run_build_recommendation(args):
    """Build recommendations from applied data in ``args.input_dir`` and dump them."""
    input_dir = args.input_dir
    output_dir = args.output_dir
    tweets_applied_filename = args.tweets_applied
    dataset_applied_filename = args.dataset_applied

    news, tweets = load(dataset_applied_filename, input_dir)
    is_dataset = True
    if tweets_applied_filename:
        # Separately applied tweets replace the tweets stored with the dataset.
        tweets = load(tweets_applied_filename, input_dir)
        is_dataset = False

    # NOTE(review): this call passes is_dataset=, while _run_recommend passes
    # evaluate= to the same function -- confirm which keyword recommend() accepts.
    recommendation, correct_news_idxs = recommend(news, tweets, top_size=10, is_dataset=is_dataset)
    dump((recommendation, correct_news_idxs), 'recommendation', output_dir)

    log_and_print(logging.INFO, 'recommendation builded and stored at {PATH}'.format(PATH=os.path.join(output_dir, 'recommendation')))


def _run_recommend(args):
    """Build recommendations and dump them to the csv file ``args.dump``.

    'recommend_dataset' also prints the RR / TOP1 / TOP3 scores;
    'recommend_tweets' recommends for separately applied tweets.

    Raises:
        Exception: if the command is neither of the two variants above.
    """
    dataset_applied_dirname, dataset_applied_filename = split_filepath(args.dataset_applied)
    dump_filepath = args.dump

    news, tweets = load(dataset_applied_filename, dataset_applied_dirname)

    if args.subparser == 'recommend_dataset':
        log_and_print(logging.INFO, 'build recommendation')
        recommendation, correct_news_idxs = recommend(news, tweets, top_size=10, evaluate=True)
        log_and_print(logging.INFO, 'recommendation result evaluation')
        # Single-argument print keeps the Python 2 output and parses on Python 3.
        print('RR = {0}'.format(RR(correct_news_idxs)))
        print('TOP1 = {0}'.format(TOP1(correct_news_idxs)))
        print('TOP3 = {0}'.format(TOP3(correct_news_idxs)))
        dump_to_csv(recommendation, dump_filepath, score_threshold=0.0)

    elif args.subparser == 'recommend_tweets':
        tweets_applied_dirname, tweets_applied_filename = split_filepath(args.tweets_applied)
        tweets = load(tweets_applied_filename, tweets_applied_dirname)

        recommendation, _ = recommend(news, tweets, top_size=10, evaluate=False)
        dump_to_csv(recommendation, dump_filepath)
    else:
        raise Exception('unexpected recommend parser')

    log_and_print(logging.INFO, 'recommendation dumped to {PATH}'.format(PATH=dump_filepath))