Exemplo n.º 1
0
def run(path, review_csv="charlotte_top10.csv", output_csv="result/question3.csv"):
    """Count food aspect-sentiment pairs per restaurant and write them to a CSV.

    Every ``.xml`` file found directly in ``path`` is treated as one review
    whose id is the file name without its extension; other files are ignored.
    For reviews whose aspect categories include ``"food"``, the food aspects
    and their sentiments are extracted with the project helpers ``asp``,
    ``aspc`` and ``asps`` (imported elsewhere in this file). Reviews are then
    matched to restaurants through ``review_csv`` and, per restaurant, the
    number of occurrences of each (aspect, sentiment) pair is counted.

    Args:
        path: Directory containing the review XML files.
        review_csv: CSV file with at least the columns ``review_id`` and
            ``restaurant_id``. Defaults to the file name the original
            implementation hard-coded.
        output_csv: Destination CSV. Its parent directory is created when it
            does not exist yet.

    The output has the columns ``rest_id``, ``Aspect``, ``Freq`` and ``Type``
    (plus the default pandas index column). Only pairs whose sentiment is
    exactly ``'Positive'`` or ``'Negative'`` are written.
    """
    # Review id -> list of food sentiment mappings. A list is kept per id so
    # that files whose names differ only in extension case (a.xml / a.XML)
    # still contribute one entry each, as in the original nested scan.
    sentiments_by_review = {}
    for file_name in os.listdir(path):
        review_id, ext = os.path.splitext(file_name)
        if ext.lower() != ".xml":
            continue
        xml_path = os.path.join(path, file_name)
        aspects = asp.get_aspect(xml_path)
        if "food" not in aspc.get_aspect_categories(aspects):
            continue
        food_aspects = aspc.get_aspect_by_category(aspects, "food")
        food_sentiment = asps.get_aspect_sentiment(
            food_aspects, xml_path, categories=False)
        sentiments_by_review.setdefault(review_id, []).append(food_sentiment)

    # Attach each extracted sentiment mapping to the restaurant of its review.
    reviews = pd.read_csv(review_csv)
    matched = []
    for review_id, rest_id in zip(reviews['review_id'],
                                  reviews['restaurant_id']):
        for food_sentiment in sentiments_by_review.get(review_id, ()):
            matched.append([rest_id, food_sentiment])
    rev_rest = pd.DataFrame(matched, columns=['rest_id', 'Aspect_Sent'])

    # One Counter per restaurant, keyed by (aspect, sentiment). Rows are
    # collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call before that.
    rows = []
    for rest_id, group in rev_rest.groupby('rest_id'):
        pair_counts = Counter()
        for aspect_sent in group['Aspect_Sent']:
            # .items() is a view, not a mapping, so every (aspect, sentiment)
            # tuple is counted as one element.
            pair_counts.update(aspect_sent.items())
        for (aspect, sentiment), freq in pair_counts.items():
            if sentiment in ('Positive', 'Negative'):
                rows.append({'rest_id': rest_id, 'Aspect': aspect,
                             'Freq': freq, 'Type': sentiment})

    result = pd.DataFrame(rows, columns=['rest_id', 'Aspect', 'Freq', 'Type'])

    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    result.to_csv(output_csv, encoding="UTF-8")
Exemplo n.º 2
0
def run(path, file_name, time_grain, output_csv="result/question2.csv"):
    """Write per-restaurant, per-period positive-sentiment ratios to a CSV.

    Every ``.xml`` file found directly in ``path`` is treated as one review
    whose id is the file name without its extension. Its aspect sentiments
    are extracted with the project helpers ``asp`` and ``asps`` (imported
    elsewhere in this file), matched to the rows of the review CSV, and
    reduced to one value per category through ``check``. For each restaurant
    and each distinct value of ``time_grain`` the share of reviews counted by
    ``get_group_size`` for the positive label is computed, for the overall
    sentiment and for every category.

    NOTE(review): ``check`` and ``get_group_size`` are defined outside this
    block; they are called with exactly the arguments the original used.

    Args:
        path: Directory containing the review XML files.
        file_name: CSV file with at least the columns ``review_id``,
            ``restaurant_id``, ``date`` and ``Sentiment``.
        time_grain: Column of the intermediate table to group by inside each
            restaurant, e.g. ``'year'`` or ``'month'``.
        output_csv: Destination CSV. Its parent directory is created when it
            does not exist yet.
    """
    categories = ['food', 'price', 'service', 'ambience', 'others']

    # Derive year and month once from the parsed date column.
    reviews = pd.read_csv(file_name)
    dates = pd.DatetimeIndex(reviews['date'])
    reviews['year'] = dates.year
    reviews['month'] = dates.month

    # Review id -> list of sentiment results, in directory-listing order.
    # The loop variable no longer shadows the ``file_name`` parameter. The
    # category list the original computed per file was never read afterwards,
    # so it is not computed here.
    sentiments_by_review = {}
    for xml_name in os.listdir(path):
        review_id, ext = os.path.splitext(xml_name)
        if ext.lower() != ".xml":
            continue
        print("Processing ..." + str(review_id))
        xml_path = os.path.join(path, xml_name)
        aspects = asp.get_aspect(xml_path)
        asp_sent = asps.get_aspect_sentiment(aspects, xml_path,
                                             categories=True)
        sentiments_by_review.setdefault(review_id, []).append(asp_sent)

    # Attach the reviews to restaurants. Each entry keeps the positional
    # layout that ``check`` receives:
    # [restaurant_id, review_id, year, month, Sentiment, aspect sentiments].
    tot_list = []
    for rest_id, review_id, year, month, overall in zip(
            reviews['restaurant_id'], reviews['review_id'], reviews['year'],
            reviews['month'], reviews['Sentiment']):
        for asp_sent in sentiments_by_review.get(review_id, ()):
            tot_list.append(
                [rest_id, review_id, year, month, overall, asp_sent])

    # One row per matched review. Rows are collected in a list and converted
    # once: DataFrame.append was removed in pandas 2.0.
    vis_rows = []
    for entry in tot_list:
        row = {
            'rest_id': entry[0],
            'rev_id': entry[1],
            'year': entry[2],
            'month': entry[3],
            'overall_sentiment': entry[4],
        }
        for category in categories:
            row[category] = check(entry, category)
        vis_rows.append(row)
    vis_df = pd.DataFrame(
        vis_rows,
        columns=['rest_id', 'rev_id', 'year', 'month', 'overall_sentiment']
        + categories)

    # Ratio of positive reviews to all reviews, per restaurant and period.
    rest_rows = []
    for rest_id, rest_reviews in vis_df.groupby('rest_id'):
        for period, period_reviews in rest_reviews.groupby(time_grain):
            total = float(period_reviews.shape[0])
            # The overall label is looked up in lower case, the category
            # labels capitalised, exactly as in the original.
            row = {
                'rest_id': rest_id,
                time_grain: period,
                'overall': get_group_size(
                    period_reviews.groupby('overall_sentiment'),
                    "positive") / total,
            }
            for category in categories:
                row[category] = get_group_size(
                    period_reviews.groupby(category), "Positive") / total
            rest_rows.append(row)
    rest_df = pd.DataFrame(
        rest_rows, columns=['rest_id', time_grain, 'overall'] + categories)

    # Writing to file
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    rest_df.to_csv(output_csv)
Exemplo n.º 3
0
def run(path, file_name, time_grain, output_csv="result/question2.csv"):
    """Write per-restaurant, per-period positive-sentiment ratios to a CSV.

    Every ``.xml`` file found directly in ``path`` is treated as one review
    whose id is the file name without its extension. Its aspect sentiments
    are extracted with the project helpers ``asp`` and ``asps`` (imported
    elsewhere in this file), matched to the rows of the review CSV, and
    reduced to one value per category through ``check``. For each restaurant
    and each distinct value of ``time_grain`` the share of reviews counted by
    ``get_group_size`` for the positive label is computed, for the overall
    sentiment and for every category.

    NOTE(review): ``check`` and ``get_group_size`` are defined outside this
    block; they are called with exactly the arguments the original used.

    Args:
        path: Directory containing the review XML files.
        file_name: CSV file with at least the columns ``review_id``,
            ``restaurant_id``, ``date`` and ``Sentiment``.
        time_grain: Column of the intermediate table to group by inside each
            restaurant, e.g. ``'year'`` or ``'month'``.
        output_csv: Destination CSV. Its parent directory is created when it
            does not exist yet.
    """
    categories = ['food', 'price', 'service', 'ambience', 'others']

    # Derive year and month once from the parsed date column.
    reviews = pd.read_csv(file_name)
    dates = pd.DatetimeIndex(reviews['date'])
    reviews['year'] = dates.year
    reviews['month'] = dates.month

    # Review id -> list of sentiment results, in directory-listing order.
    # The loop variable no longer shadows the ``file_name`` parameter. The
    # category list the original computed per file was never read afterwards,
    # so it is not computed here.
    sentiments_by_review = {}
    for xml_name in os.listdir(path):
        review_id, ext = os.path.splitext(xml_name)
        if ext.lower() != ".xml":
            continue
        print("Processing ..." + str(review_id))
        xml_path = os.path.join(path, xml_name)
        aspects = asp.get_aspect(xml_path)
        asp_sent = asps.get_aspect_sentiment(aspects, xml_path, categories=True)
        sentiments_by_review.setdefault(review_id, []).append(asp_sent)

    # Attach the reviews to restaurants. Each entry keeps the positional
    # layout that ``check`` receives:
    # [restaurant_id, review_id, year, month, Sentiment, aspect sentiments].
    tot_list = []
    for rest_id, review_id, year, month, overall in zip(
            reviews['restaurant_id'], reviews['review_id'], reviews['year'],
            reviews['month'], reviews['Sentiment']):
        for asp_sent in sentiments_by_review.get(review_id, ()):
            tot_list.append([rest_id, review_id, year, month, overall, asp_sent])

    # One row per matched review. Rows are collected in a list and converted
    # once: DataFrame.append was removed in pandas 2.0.
    vis_rows = []
    for entry in tot_list:
        row = {'rest_id': entry[0], 'rev_id': entry[1], 'year': entry[2],
               'month': entry[3], 'overall_sentiment': entry[4]}
        for category in categories:
            row[category] = check(entry, category)
        vis_rows.append(row)
    vis_columns = ['rest_id', 'rev_id', 'year', 'month', 'overall_sentiment'] + categories
    vis_df = pd.DataFrame(vis_rows, columns=vis_columns)

    # Ratio of positive reviews to all reviews, per restaurant and period.
    rest_rows = []
    for rest_id, rest_reviews in vis_df.groupby('rest_id'):
        for period, period_reviews in rest_reviews.groupby(time_grain):
            total = float(period_reviews.shape[0])
            # The overall label is looked up in lower case, the category
            # labels capitalised, exactly as in the original.
            g_over = period_reviews.groupby('overall_sentiment')
            row = {'rest_id': rest_id, time_grain: period,
                   'overall': get_group_size(g_over, "positive") / total}
            for category in categories:
                g_cat = period_reviews.groupby(category)
                row[category] = get_group_size(g_cat, "Positive") / total
            rest_rows.append(row)
    rest_columns = ['rest_id', time_grain, 'overall'] + categories
    rest_df = pd.DataFrame(rest_rows, columns=rest_columns)

    # Writing to file
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    rest_df.to_csv(output_csv)