def extract_aspects(reviews):
    """
    Return the aspects from the set of reviews.

    INPUT: iterable of strings (pd Series, list), one entry per review
    OUTPUT: the result of `aspects_from_tagged_sents` on the POS-tagged
            sentences (described upstream as a list of aspects)

    Pipeline: split every review into sentences, tokenize each sentence,
    POS-tag each token list, then extract the aspects from the tagged
    sentences.
    """

    # import the aspect extraction functions
    from extract_aspects import (
        get_sentences,
        tokenize,
        pos_tag,
        aspects_from_tagged_sents,
    )

    # put all the sentences in all reviews in one stream
    sentences = []
    for review in reviews:
        # NOTE(review): the review is UTF-8 encoded before splitting, as in
        # the original code; under Python 3 this hands bytes to
        # get_sentences -- confirm that is what the helper expects.
        sentences.extend(get_sentences(review.encode('utf-8', 'ignore')))

    # Tokenize each sentence exactly once.  The previous comprehension had
    # its `for` clauses reversed, so it re-walked the sentences of the last
    # review for every collected sentence (duplicating tokenized sentences)
    # and raised NameError when `reviews` was empty.
    tokenized_sentences = [tokenize(sentence) for sentence in sentences]

    # pos tag each sentence
    tagged_sentences = [pos_tag(sentence) for sentence in tokenized_sentences]

    # from the pos tagged sentences, get a list of aspects
    return aspects_from_tagged_sents(tagged_sentences)
# Example #2
# 0
def get_sentences_by_aspect(aspect, reviews):
    """
    Return every tokenized sentence in the reviews that mentions an aspect.

    INPUT: string (aspect), iterable of strings (full reviews)
    OUTPUT: list of tokenized sentences (the values returned by `tokenize`)
            for which `aspect in sentence` is true

    Each sentence appears once per occurrence in the reviews, in review
    order.
    """

    # TODO: this duplicates the sentence/tokenize pipeline of
    # 'extract_aspects'; factor the shared part into one helper.
    from extract_aspects import get_sentences, tokenize

    # collect the sentences of every review into one list
    sentences = []
    for review in reviews:
        # NOTE(review): the review is UTF-8 encoded before splitting, as in
        # the original code; under Python 3 this hands bytes to
        # get_sentences -- confirm that is what the helper expects.
        sentences.extend(get_sentences(review.encode('utf-8', 'ignore')))

    # Tokenize each sentence exactly once.  The previous comprehension had
    # its `for` clauses reversed, which duplicated every sentence once per
    # sentence of the last review and failed with NameError on empty input.
    tokenized_sentences = [tokenize(sentence) for sentence in sentences]

    return [sent for sent in tokenized_sentences if aspect in sent]
# Example #3
# 0
def extract_aspects(reviews):
    """
    Return the aspects from the set of reviews.

    INPUT: iterable of strings (pd Series, list), one entry per review
    OUTPUT: the result of `aspects_from_tagged_sents` on the POS-tagged
            sentences (described upstream as a list of aspects)

    Pipeline: split every review into sentences, tokenize each sentence,
    POS-tag each token list, then extract the aspects from the tagged
    sentences.
    """

    # import the aspect extraction functions
    from extract_aspects import (
        get_sentences,
        tokenize,
        pos_tag,
        aspects_from_tagged_sents,
    )

    # Walk every sentence of every review and tokenize it.  The outer
    # `for` clause must come first: the previous version listed the clauses
    # in reverse order and referenced `sentences` / `review` before either
    # was defined, so the function always raised NameError.
    tokenized_sentences = [
        tokenize(sentence)
        for review in reviews
        for sentence in get_sentences(review)
    ]

    # pos tag each sentence
    tagged_sentences = [pos_tag(sentence) for sentence in tokenized_sentences]

    # from the pos tagged sentences, get a list of aspects
    return aspects_from_tagged_sents(tagged_sentences)
def aspect_opinions(tweets, aspects):
    """
    Group tokenized tweet sentences by relevant aspect and score each group.

    INPUT: tweets  - container that is iterated for keys; `tweets[key]` is
                     the text handed to `get_sentences`
           aspects - passed unchanged to `get_relevant_aspects`
    OUTPUT: dict mapping aspect -> [number of sentences collected for that
            aspect, `score_aspect` of those sentences]
    """
    from extract_aspects import get_sentences, tokenize, pos_tag, aspects_from_tagged_sents

    # flatten the sentences of every tweet, then tokenize them all
    raw_sentences = [
        sentence for key in tweets for sentence in get_sentences(tweets[key])
    ]
    token_lists = [tokenize(sentence) for sentence in raw_sentences]

    # aspect -> tokenized sentences in which that aspect was found relevant
    grouped = {}
    for tokens in token_lists:
        found = extract_aspects(tokens)
        relevant = get_relevant_aspects(found, aspects)
        for group_key in relevant:
            # relevant[group_key] holds the aspects to file this sentence under
            for aspect in relevant[group_key]:
                grouped.setdefault(aspect, []).append(tokens)

    return {
        aspect: [len(members), score_aspect(members)]
        for aspect, members in grouped.items()
    }
def aspect_opinions(reviews, aspects):
    """
    Score every aspect against the tokenized sentences of all reviews.

    INPUT: reviews - iterable of review texts, each passed to `get_sentences`
           aspects - iterable of aspects to score
    OUTPUT: dict mapping aspect -> `score_aspect(tokenized_sentences, aspect)`
    """
    from extract_aspects import get_sentences, tokenize, pos_tag, aspects_from_tagged_sents

    # gather the sentences of all reviews first, then tokenize them
    collected = [
        sentence for review in reviews for sentence in get_sentences(review)
    ]
    token_lists = [tokenize(sentence) for sentence in collected]

    return {aspect: score_aspect(token_lists, aspect) for aspect in aspects}
# Example #6
# 0
def extract_aspects(reviews):
    """
    Run the sentence -> token -> POS-tag pipeline over the reviews and
    return what `all_aspects_from_tagged_sents` yields for the result.

    INPUT: iterable of review texts, each passed to `get_sentences`
    OUTPUT: return value of `all_aspects_from_tagged_sents`
    """
    from extract_aspects import get_sentences, tokenize, pos_tag, aspects_from_tagged_sents

    # one flat list with the sentences of every review
    all_sentences = [
        sentence for review in reviews for sentence in get_sentences(review)
    ]
    token_lists = [tokenize(sentence) for sentence in all_sentences]
    tagged = [pos_tag(tokens) for tokens in token_lists]

    # NOTE(review): `all_aspects_from_tagged_sents` is not one of the names
    # imported above; it has to be available at module level -- confirm.
    return all_aspects_from_tagged_sents(tagged)
def get_sentences_by_aspect(aspect, reviews):
    """
    Return every tokenized sentence in the reviews that mentions an aspect.

    INPUT: string (aspect), iterable of strings (full reviews)
    OUTPUT: list of tokenized sentences (the values returned by `tokenize`)
            for which `aspect in sentence` is true

    Each sentence appears once per occurrence in the reviews, in review
    order.
    """

    # TODO: this duplicates the sentence/tokenize pipeline of
    # 'extract_aspects'; factor the shared part into one helper.
    from extract_aspects import get_sentences, tokenize

    # Walk every sentence of every review and tokenize it.  The outer
    # `for` clause must come first: the previous version listed the clauses
    # in reverse order and referenced `sentences` / `review` before either
    # was defined, so the function always raised NameError.
    tokenized_sentences = [
        tokenize(sentence)
        for review in reviews
        for sentence in get_sentences(review)
    ]

    return [sent for sent in tokenized_sentences if aspect in sent]
def aspect_opinions(reviews, aspects, relevant_features):
    """
    Score every aspect over the reviews and merge the results per class.

    INPUT: reviews           - iterable of review texts, each passed to
                               `get_sentences`
           aspects           - dict; every key is scored with `score_aspect`
                               and every value is looked up in
                               `relevant_features`
           relevant_features - mapping from the values of `aspects` to a
                               class label
    OUTPUT: dict mapping class label -> (count, score)

    `score_aspect` results are indexed as (count, score).  When several
    aspects fall into the same class their counts are summed and their
    scores are combined as a count-weighted mean.
    """
    from extract_aspects import get_sentences, tokenize

    sentences = []
    for review in reviews:
        sentences.extend(get_sentences(review))

    tokenized_sentences = [tokenize(sentence) for sentence in sentences]

    # (value stored for the aspect, score_aspect result) for every aspect
    scores = [
        (feature, score_aspect(tokenized_sentences, aspect))
        for aspect, feature in aspects.items()
    ]

    aspect_scores = {}
    for feature, result in scores:
        klass = relevant_features[feature]
        count, score = result[0], result[1]

        if klass not in aspect_scores:
            aspect_scores[klass] = (count, score)
            continue

        old_count, old_score = aspect_scores[klass]
        total = old_count + count
        if total:
            # Count-weighted mean: each score is weighted by its own count
            # and divided by the merged count.  The previous formula
            # weighted the incoming score by the already-summed count and
            # divided by old_count + total, which skewed the result.
            # float() keeps the division exact for integer inputs.
            merged = (old_count * old_score + count * score) / float(total)
        else:
            # no observations on either side: nothing to average
            merged = old_score
        aspect_scores[klass] = (total, merged)

    return aspect_scores