def read_data(data_link):
    """Build the movies table with cleaned plot text.

    Reads the tab-separated, header-less metadata file at ``data_link``,
    labels its nine columns, merges it with the plots returned by
    ``cd.read_plot_from_corpus`` and adds the ``genre_new`` and
    ``clean_plot`` columns.

    NOTE(review): ``file_link`` is not a parameter of this function; it has
    to be resolvable from the enclosing (module) scope -- confirm.

    Args:
        data_link: path or URL handed to ``pandas.read_csv``.

    Returns:
        The merged table after ``cd.remove_empty_rows``, with an extra
        ``clean_plot`` column (plot passed through ``cd.clean_text`` and
        then ``cd.remove_stopwords``).
    """
    # Only three of the nine columns get meaningful names; the rest keep
    # their positional integer labels.
    column_labels = ["movie_id", 1, "movie_name", 3, 4, 5, 6, 7, "genre"]
    metadata = pd.read_csv(data_link, sep='\t', header=None)
    metadata.columns = column_labels

    plots = cd.read_plot_from_corpus(file_link)
    movies = cd.merge_data(plots, metadata)
    movies["genre_new"] = cd.convert_genres(movies)
    movies = cd.remove_empty_rows(movies)

    # Two passes over the plot text: generic cleaning, then stop-word removal.
    cleaned_plots = movies['plot'].apply(cd.clean_text)
    movies['clean_plot'] = cleaned_plots
    movies['clean_plot'] = movies['clean_plot'].apply(cd.remove_stopwords)
    return movies
def testCleanText(self):
    """clean_text should turn the noisy sample into 'this text has text'."""
    noisy_input = 'This TEXT [1] has 123 text The Latin Library'
    expected_output = 'this text has text'

    actual_output = clean_data.clean_text(noisy_input)

    self.assertEqual(expected_output, actual_output)
def create_data(self):
    """Convert the raw dialogues into train/dev/test dictionaries.

    For every dialogue in ``self.data`` (except the ignored ids) this keeps
    the goals of the domains listed in ``ontology.all_domains``, normalises
    requestable slot names, and rebuilds the log as a list of turns. Each
    turn dict holds ``turn_num``, ``user``, ``response``, ``belief``,
    ``gate`` and ``action``. Dialogues without any kept domain are added
    to the ignore list and skipped.

    Side effect: ``ontology.json`` is read from ``self.data_path`` and a
    normalised copy is written next to it as ``ontology_processed.json``.

    Returns:
        tuple: ``(train, dev, test)`` -- dicts mapping dialogue id (without
        file extension) to the processed dialogue, split according to
        ``self.train_list``, ``self.dev_list`` and ``self.test_list``.
    """
    data = {}
    train = {}
    dev = {}
    test = {}
    ignore_list = ["SNG1213", "PMUL0382", "PMUL0237"]

    logger.info("Processing data...")
    for dial_id, dial in tqdm(self.data.items()):
        dial_id = dial_id.split(".")[0]
        if dial_id in ignore_list:
            continue

        goal = {}
        dial_domains = []
        for key, value in dial["goal"].items():  # process user's goal
            if key not in ontology.all_domains or value == {}:
                continue
            if value.get("reqt"):  # normalize requestable slot names
                for idx, slot in enumerate(value["reqt"]):
                    if ontology.normlize_slot_names.get(slot):
                        value["reqt"][idx] = ontology.normlize_slot_names[slot]
            goal[key] = value
            dial_domains.append(key)

        if not dial_domains:  # ignore police and hospital
            ignore_list.append(dial_id)
            continue

        dialogue = {"goal": goal, "log": []}
        acts = self.acts[dial_id]
        turn = {}
        for turn_num, turn_dial in enumerate(dial["log"]):
            meta_data = turn_dial["metadata"]
            if meta_data == {}:  # user turn
                # Log entries alternate user/system, so two entries form one turn.
                turn["turn_num"] = turn_num // 2
                turn["user"] = clean_text(turn_dial["text"])
                continue

            # system turn: completes the turn opened by the preceding user entry
            turn["response"] = clean_text(turn_dial["text"])
            belief = {}
            gate = {}
            for domain in dial_domains:  # active domains of dialogue
                for section in ("book", "semi"):
                    for slot, value in meta_data[domain][section].items():
                        if section == "book" and slot == "booked":
                            continue
                        slot, value = clean_slot_values(domain, slot, value)
                        if value == "":
                            continue
                        domain_slot = "{}-{}".format(domain, slot)
                        belief[domain_slot] = value
                        gate_name = value if value == "don't care" else "prediction"
                        gate[domain_slot] = ontology.gate_dict[gate_name]
            turn["belief"] = belief
            turn["gate"] = gate

            # Dialogue acts are keyed by the 1-based turn number as a string;
            # a plain string entry carries no structured act and is skipped.
            act = {}
            turn_acts = acts.get(str(turn["turn_num"] + 1))
            if turn_acts and not isinstance(turn_acts, str):
                for domain_act, slots in turn_acts.items():
                    act_temp = []
                    for slot in slots:  # slot: [slot, value]
                        slot_, value_ = clean_slot_values(
                            domain_act.split("-")[0], slot[0], slot[1])
                        if slot_ == "none" or value_ in ["?", "none"]:
                            # general domain or request slot or parking
                            act_temp.append(slot_)
                        else:
                            act_temp.append("{}-{}".format(slot_, value_))
                    act[domain_act.lower()] = act_temp
            turn["action"] = act

            dialogue["log"].append(turn)
            turn = {}  # clear turn
        data[dial_id] = dialogue
    logger.info("Processing finished.")

    logger.info("Dividing data to train/dev/test...")
    splits = (
        (train, self.train_list),
        (dev, self.dev_list),
        (test, self.test_list),
    )
    for split, id_list in splits:
        for dial_id in id_list:
            dial_id = dial_id.split(".")[0]
            if dial_id not in ignore_list:
                split[dial_id] = data[dial_id]
    logger.info("Dividing finished.")

    # Context manager so the ontology file handle is closed deterministically.
    with open(os.path.join(self.data_path, "ontology.json"), "r") as f:
        value_ontology = json.load(f)

    value_ontology_processed = {}
    logger.info("Processing ontology...")
    for domain_slot, values in value_ontology.items():
        parts = domain_slot.split("-")
        domain = parts[0]
        slot = parts[2].lower()
        if ontology.normlize_slot_names.get(slot):
            slot = ontology.normlize_slot_names[slot]
        domain_slot = "-".join([domain, slot])
        value_ontology_processed[domain_slot] = [
            clean_slot_values(domain, slot, value)[1] for value in values
        ]

    # Written beside the source ontology; the original used a bare
    # ``data_path`` name here, which only worked if a module global existed.
    with open(os.path.join(self.data_path, "ontology_processed.json"), "w") as f:
        json.dump(value_ontology_processed, f, indent=2)
    logger.info("Ontology was processed.")

    return train, dev, test
def create_data(self):
    """Convert the raw dialogues into train/dev/test dictionaries.

    For every dialogue in ``self.data`` (except the ignored ids) this keeps
    the goals of the domains listed in ``ontology.all_domains``, normalises
    requestable slot names, and rebuilds the log as a list of turns. Each
    turn dict holds ``turn_num``, ``user``, ``response``,
    ``response_delex``, ``belief``, ``gate`` and ``action``. Dialogues
    without any kept domain are added to the ignore list and skipped.

    Side effect: ``ontology.json`` is read from ``self.data_path`` and a
    normalised copy is written next to it as ``ontology_processed.json``.

    Returns:
        tuple: ``(train, dev, test)`` -- dicts mapping dialogue id (without
        file extension) to the processed dialogue, split according to
        ``self.train_list``, ``self.dev_list`` and ``self.test_list``.
    """
    data = {}
    train = {}
    dev = {}
    test = {}
    ignore_list = ["SNG1213", "PMUL0382", "PMUL0237"]

    logger.info("Processing data...")
    for dial_id, dial in tqdm(self.data.items()):
        dial_id = dial_id.split(".")[0]
        if dial_id in ignore_list:
            continue

        goal = {}
        dial_domains = []
        for key, value in dial["goal"].items():  # process user's goal
            if key not in ontology.all_domains or value == {}:
                continue
            if value.get("reqt"):  # normalize requestable slot names
                for idx, slot in enumerate(value["reqt"]):
                    if ontology.normlize_slot_names.get(slot):
                        value["reqt"][idx] = ontology.normlize_slot_names[slot]
            goal[key] = value
            dial_domains.append(key)

        if not dial_domains:  # ignore police and hospital
            ignore_list.append(dial_id)
            continue

        dialogue = {"goal": goal, "log": []}
        acts = self.acts[dial_id]
        turn = {}
        for turn_num, turn_dial in enumerate(dial["log"]):
            meta_data = turn_dial["metadata"]
            if meta_data == {}:  # user turn
                # Log entries alternate user/system, so two entries form one turn.
                turn["turn_num"] = turn_num // 2
                turn["user"] = clean_text(turn_dial["text"])
                continue

            # system turn: completes the turn opened by the preceding user entry
            response = clean_text(turn_dial["text"])
            turn["response"] = response
            turn["response_delex"] = self._delexicalize_response(response)

            belief = {}
            gate = {}
            for domain in dial_domains:  # active domains of dialogue
                for section in ("book", "semi"):
                    for slot, value in meta_data[domain][section].items():
                        if section == "book" and slot == "booked":
                            continue
                        slot, value = clean_slot_values(domain, slot, value)
                        if value == "":
                            continue
                        domain_slot = "{}-{}".format(domain, slot)
                        belief[domain_slot] = value
                        gate_name = value if value == "dontcare" else "prediction"
                        gate[domain_slot] = ontology.gate_dict[gate_name]
            turn["belief"] = belief
            turn["gate"] = gate

            # Dialogue acts are keyed by the 1-based turn number as a string;
            # a plain string entry carries no structured act and is skipped.
            act = {}
            turn_acts = acts.get(str(turn["turn_num"] + 1))
            if turn_acts and not isinstance(turn_acts, str):
                for domain_act, slots in turn_acts.items():
                    act_temp = []
                    for slot in slots:  # slot: [slot, value]
                        slot_, value_ = clean_slot_values(
                            domain_act.split("-")[0], slot[0], slot[1])
                        if slot_ == "none" or value_ in ["?", "none"]:
                            # general domain or request slot or parking
                            act_temp.append(slot_)
                        else:
                            act_temp.append("{}-{}".format(slot_, value_))
                    act[domain_act.lower()] = act_temp
            turn["action"] = act

            dialogue["log"].append(turn)
            turn = {}  # clear turn
        data[dial_id] = dialogue
    logger.info("Processing finished.")

    logger.info("Dividing data to train/dev/test...")
    splits = (
        (train, self.train_list),
        (dev, self.dev_list),
        (test, self.test_list),
    )
    for split, id_list in splits:
        for dial_id in id_list:
            dial_id = dial_id.split(".")[0]
            if dial_id not in ignore_list:
                split[dial_id] = data[dial_id]
    logger.info("Dividing finished.")

    # Context manager so the ontology file handle is closed deterministically.
    with open(os.path.join(self.data_path, "ontology.json"), "r") as f:
        value_ontology = json.load(f)

    value_ontology_processed = {}
    logger.info("Processing ontology...")
    for domain_slot, values in value_ontology.items():
        parts = domain_slot.split("-")
        domain = parts[0]
        slot = parts[2].lower()
        if ontology.normlize_slot_names.get(slot):
            slot = ontology.normlize_slot_names[slot]
        domain_slot = "-".join([domain, slot])
        value_ontology_processed[domain_slot] = [
            clean_slot_values(domain, slot, value)[1] for value in values
        ]

    # Written beside the source ontology; the original used a bare
    # ``data_path`` name here, which only worked if a module global existed.
    with open(os.path.join(self.data_path, "ontology_processed.json"), "w") as f:
        json.dump(value_ontology_processed, f, indent=2)
    logger.info("Ontology was processed.")

    return train, dev, test


def _delexicalize_response(self, response_):
    """Return ``response_`` with slot values replaced by placeholders.

    Applies ``self.delex_dict``, then (only for responses that mention a
    train-related keyword) ``self.train_time_dict``, then masks 11-digit
    runs as ``[phone]`` and finally turns every ``[train_time]`` into
    ``[train_arrive]`` or ``[train_leave]``.
    """
    for k, v in self.delex_dict.items():
        response_ = response_.replace(k, v)  # delexicalize values

    # Substring test on the whole response, so e.g. "trains" also matches.
    train_keywords = (
        "train", "arrive", "arrives", "arrived", "arriving", "arrival",
        "destination", "reach", "leave", "leaves", "leaving", "leaved",
        "depart", "departing", "departs", "departure", "[train_",
    )
    if any(keyword in response_ for keyword in train_keywords):
        for k, v in self.train_time_dict.items():
            response_ = response_.replace(k, v)  # delexicalize train times

    # Raw string: "\d" / "\s" in a normal literal are invalid escapes.
    response_ = re.sub(r"(\d\s?){11}", "[phone]", response_)  # delexicalize phone number

    return self._resolve_train_time_placeholders(response_)


def _resolve_train_time_placeholders(self, response_):
    """Replace each ``[train_time]`` by ``[train_arrive]`` or ``[train_leave]``.

    The nearest preceding cue word decides; without any cue the placeholder
    becomes ``[train_leave]``. Whitespace is normalised to single spaces
    whenever at least one placeholder is rewritten.
    """
    placeholder = "[train_time]"
    arrive_cues = {
        "arrive", "arrives", "arrived", "arriving", "arrival",
        "destination", "reach", "by", "before", "have", "to",
    }
    leave_cues = {
        "leave", "leaves", "leaving", "leaved", "depart", "departing",
        "departs", "departure", "from", "after", "earlier", "there",
    }

    while placeholder in response_:
        tokens = response_.split()
        # Match tokens *containing* the placeholder: it may be glued to
        # neighbouring characters (e.g. "[train_time]pm"), in which case an
        # exact ``list.index`` lookup would raise ValueError.
        idx = next(i for i, tok in enumerate(tokens) if placeholder in tok)

        label = "[train_leave]"
        for token in reversed(tokens[:idx]):  # nearest cue first
            if token in arrive_cues:
                label = "[train_arrive]"
                break
            if token in leave_cues:
                break

        tokens[idx] = tokens[idx].replace(placeholder, label, 1)
        response_ = " ".join(tokens)
    return response_
def sentiment():
    """Render a donut chart of tweet sentiment for a search term.

    On POST, reads ``search_term`` and ``count`` from the submitted form,
    fetches up to ``count`` tweets, classifies each cleaned tweet text by
    its TextBlob polarity (positive / negative / neutral) and returns the
    chart as a PNG response.

    Percentages are computed over the tweets actually returned by the
    search, which may be fewer than ``count``. When no tweet is returned
    the image only carries a "No tweets found" message.

    Returns:
        The ``send_file`` response holding the PNG, or ``None`` for
        non-POST requests.
    """
    if request.method != "POST":
        # NOTE(review): the original also produced no response for other
        # methods; presumably the route only accepts POST -- confirm.
        return None

    search_term = request.form['search_term']
    count = int(request.form['count'])
    tweets = tweepy.Cursor(api.search, q=search_term).items(count)

    positive = 0
    negative = 0
    neutral = 0
    for data in tweets:
        polarity = TextBlob(clean_text(data.text)).sentiment.polarity
        if polarity == 0.00:
            neutral += 1
        elif polarity > 0.00:
            positive += 1
        elif polarity < 0.00:
            negative += 1

    # The search can yield fewer tweets than requested (or none at all), so
    # the shares are based on what was really analysed, not on ``count``.
    analyzed = positive + negative + neutral

    fig, ax = plt.subplots()
    try:
        if analyzed:
            sizes = [100.0 * part / analyzed
                     for part in (positive, negative, neutral)]
            labels = [
                '{}[{:.2f}%]'.format(name, share)
                for name, share in zip(('Positive', 'Negative', 'Neutral'), sizes)
            ]
            ax.pie(
                sizes,
                colors=['green', 'red', 'orange'],
                labels=labels,
                autopct='%1.1f%%',
                startangle=90,
                pctdistance=0.85,
            )
            # White disc over the centre turns the pie into a donut.
            ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))
            ax.axis('equal')
        else:
            ax.text(0.5, 0.5, 'No tweets found', ha='center', va='center',
                    transform=ax.transAxes)
            ax.axis('off')

        ax.set_title('People reaction on ' + "'" + search_term.capitalize() + "'"
                     + ' by analyzing ' + str(analyzed) + ' Tweets')
        fig.tight_layout()

        FigureCanvas(fig)  # attach the canvas used to render the figure
        img = BytesIO()
        fig.savefig(img, format='png')
        img.seek(0)
    finally:
        # pyplot keeps every figure alive until closed; without this each
        # request would leak one figure in a long-running server.
        plt.close(fig)

    return send_file(img, mimetype='image/png')