Python clean_text示例，clean_data.clean_text Python示例

示例#1

0

显示文件

文件： supervised_topic_classification_main.py 项目： tuananhbmt1996/Classification-Extension

def read_data(data_link):
    """Load the movie metadata table and attach cleaned plot text to it.

    Args:
        data_link: Path or URL of a tab-separated, header-less metadata
            file with nine columns.

    Returns:
        The merged frame produced by the ``cd`` helpers, with the extra
        columns ``genre_new`` and ``clean_plot``.
    """
    metadata = pd.read_csv(data_link, sep='\t', header=None)
    metadata.columns = ["movie_id", 1, "movie_name", 3, 4, 5, 6, 7, "genre"]

    # NOTE(review): `file_link` is not a parameter of this function; it is
    # presumably a module-level name -- confirm it is defined by the caller.
    plots = cd.read_plot_from_corpus(file_link)
    movies = cd.merge_data(plots, metadata)
    movies["genre_new"] = cd.convert_genres(movies)
    movies = cd.remove_empty_rows(movies)

    # Two passes over the plot text: clean first, then drop stop words.
    movies['clean_plot'] = movies['plot'].apply(cd.clean_text)
    movies['clean_plot'] = movies['clean_plot'].apply(cd.remove_stopwords)
    return movies

示例#2

0

显示文件

文件： text_unit_tests.py 项目： huguensjean/LatinLearner

 def testCleanText(self):
     """Check clean_text against one sample sentence and its expected output."""
     expected = 'this text has text'
     raw = 'This TEXT [1] has 123 text The Latin Library'
     self.assertEqual(expected, clean_data.clean_text(raw))

示例#3

0

显示文件

文件： create_data.py 项目： gusalsdmlwlq/DS-DST

    def create_data(self):
        """Convert raw dialogues into turn records and split them into sets.

        Every dialogue in ``self.data`` is reduced to its goal (restricted to
        the domains in ``ontology.all_domains``) and a log with one entry per
        user/system exchange holding the cleaned user text, the cleaned
        system response, the belief state, the gate labels and the system
        actions. The dialogues are then distributed over train/dev/test
        according to ``self.train_list``, ``self.dev_list`` and
        ``self.test_list``. Finally ``ontology.json`` is read from
        ``self.data_path``, its slot names and values are normalised, and the
        result is written to ``ontology_processed.json`` in the same
        directory.

        Returns:
            A ``(train, dev, test)`` tuple of dicts keyed by dialogue id
            (the part of the original id before the first ``"."``).
        """
        data = {}
        train = {}
        dev = {}
        test = {}
        # A set gives O(1) membership tests; ids are added while processing.
        ignored_ids = {"SNG1213", "PMUL0382", "PMUL0237"}

        logger.info("Processing data...")
        for dial_id, dial in tqdm(self.data.items()):
            dial_id = dial_id.split(".")[0]
            if dial_id in ignored_ids:
                continue

            goal = {}
            dial_domains = []
            for key, value in dial["goal"].items():  # process user's goal
                if key in ontology.all_domains and value != {}:
                    if value.get("reqt"):  # normalize requestable slot names
                        for idx, slot in enumerate(value["reqt"]):
                            if ontology.normlize_slot_names.get(slot):
                                value["reqt"][idx] = ontology.normlize_slot_names[slot]
                    goal[key] = value
                    dial_domains.append(key)
            if not dial_domains:
                # No usable domain in the goal (original note: police and
                # hospital dialogues); exclude it from the splits as well.
                ignored_ids.add(dial_id)
                continue

            dialogue = {"goal": goal, "log": []}
            acts = self.acts[dial_id]
            turn = {}
            for turn_num, turn_dial in enumerate(dial["log"]):
                meta_data = turn_dial["metadata"]
                if meta_data == {}:  # user turn
                    # Log entries alternate user/system, so two entries
                    # make up one turn.
                    turn["turn_num"] = turn_num // 2
                    turn["user"] = clean_text(turn_dial["text"])
                    continue

                # system turn
                turn["response"] = clean_text(turn_dial["text"])
                belief = {}
                gate = {}
                act = {}

                for domain in dial_domains:  # active domains of dialogue
                    for section in ("book", "semi"):
                        for slot, value in meta_data[domain][section].items():
                            if section == "book" and slot == "booked":
                                continue
                            slot, value = clean_slot_values(domain, slot, value)
                            if value == "":
                                continue
                            domain_slot = "{}-{}".format(domain, slot)
                            belief[domain_slot] = value
                            if value == "don't care":
                                gate[domain_slot] = ontology.gate_dict[value]
                            else:
                                gate[domain_slot] = ontology.gate_dict["prediction"]
                turn["belief"] = belief
                turn["gate"] = gate

                # mapping system action; the acts are keyed by 1-based turn
                # number as a string, and string entries are skipped.
                turn_acts = acts.get(str(turn["turn_num"] + 1))
                if turn_acts and not isinstance(turn_acts, str):
                    for domain_act, slots in turn_acts.items():
                        act_domain = domain_act.split("-")[0]
                        act_temp = []
                        for slot in slots:  # slot: [slot, value]
                            slot_, value_ = clean_slot_values(act_domain, slot[0], slot[1])
                            if slot_ == "none" or value_ in ("?", "none"):
                                # general domain or request slot or parking
                                act_temp.append(slot_)
                            else:
                                act_temp.append("{}-{}".format(slot_, value_))
                        act[domain_act.lower()] = act_temp
                turn["action"] = act

                dialogue["log"].append(turn)
                turn = {}  # clear turn

            data[dial_id] = dialogue

        logger.info("Processing finished.")
        logger.info("Dividing data to train/dev/test...")
        splits = (
            (self.train_list, train),
            (self.dev_list, dev),
            (self.test_list, test),
        )
        for id_list, target in splits:
            for dial_id in id_list:
                dial_id = dial_id.split(".")[0]
                if dial_id not in ignored_ids:
                    target[dial_id] = data[dial_id]
        logger.info("Dividing finished.")

        # Context manager so the handle is closed deterministically.
        with open(os.path.join(self.data_path, "ontology.json"), "r") as f:
            value_ontology = json.load(f)
        value_ontology_processed = {}

        logger.info("Processing ontology...")
        for domain_slot, values in value_ontology.items():
            parts = domain_slot.split("-")
            domain = parts[0]
            slot = parts[2].lower()
            if ontology.normlize_slot_names.get(slot):
                slot = ontology.normlize_slot_names[slot]
            value_ontology_processed["-".join([domain, slot])] = [
                clean_slot_values(domain, slot, value)[1] for value in values
            ]
        # Written next to the input ontology via self.data_path; the bare
        # name `data_path` used previously is not defined in this method.
        with open(os.path.join(self.data_path, "ontology_processed.json"), "w") as f:
            json.dump(value_ontology_processed, f, indent=2)
        logger.info("Ontology was processed.")

        return train, dev, test

示例#4

0

显示文件

    def create_data(self):
        """Convert raw dialogues into turn records and split them into sets.

        Every dialogue in ``self.data`` is reduced to its goal (restricted to
        the domains in ``ontology.all_domains``) and a log with one entry per
        user/system exchange. Each entry holds the cleaned user text, the
        cleaned system response, a delexicalised copy of the response, the
        belief state, the gate labels and the system actions. The dialogues
        are then distributed over train/dev/test according to
        ``self.train_list``, ``self.dev_list`` and ``self.test_list``.
        Finally ``ontology.json`` is read from ``self.data_path``, its slot
        names and values are normalised, and the result is written to
        ``ontology_processed.json`` in the same directory.

        Returns:
            A ``(train, dev, test)`` tuple of dicts keyed by dialogue id
            (the part of the original id before the first ``"."``).
        """
        data = {}
        train = {}
        dev = {}
        test = {}
        # A set gives O(1) membership tests; ids are added while processing.
        ignored_ids = {"SNG1213", "PMUL0382", "PMUL0237"}

        # Loop-invariant lookup tables, built once instead of per turn.
        train_tokens = (
            "train", "arrive", "arrives", "arrived", "arriving", "arrival",
            "destination", "reach", "leave", "leaves", "leaving", "leaved",
            "depart", "departing", "departs", "departure", "[train_",
        )
        arrive_cues = frozenset((
            "arrive", "arrives", "arrived", "arriving", "arrival",
            "destination", "reach", "by", "before", "have", "to",
        ))
        leave_cues = frozenset((
            "leave", "leaves", "leaving", "leaved", "depart", "departing",
            "departs", "departure", "from", "after", "earlier", "there",
        ))
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        phone_pattern = re.compile(r"(\d\s?){11}")

        logger.info("Processing data...")
        for dial_id, dial in tqdm(self.data.items()):
            dial_id = dial_id.split(".")[0]
            if dial_id in ignored_ids:
                continue

            goal = {}
            dial_domains = []
            for key, value in dial["goal"].items():  # process user's goal
                if key in ontology.all_domains and value != {}:
                    if value.get("reqt"):  # normalize requestable slot names
                        for idx, slot in enumerate(value["reqt"]):
                            if ontology.normlize_slot_names.get(slot):
                                value["reqt"][idx] = ontology.normlize_slot_names[slot]
                    goal[key] = value
                    dial_domains.append(key)
            if not dial_domains:
                # No usable domain in the goal (original note: police and
                # hospital dialogues); exclude it from the splits as well.
                ignored_ids.add(dial_id)
                continue

            dialogue = {"goal": goal, "log": []}
            acts = self.acts[dial_id]
            turn = {}
            for turn_num, turn_dial in enumerate(dial["log"]):
                meta_data = turn_dial["metadata"]
                if meta_data == {}:  # user turn
                    # Log entries alternate user/system, so two entries
                    # make up one turn.
                    turn["turn_num"] = turn_num // 2
                    turn["user"] = clean_text(turn_dial["text"])
                    continue

                # system turn
                # The cleaned text is computed once and used both as the
                # plain response and as the start of the delexicalised one.
                response_ = clean_text(turn_dial["text"])
                turn["response"] = response_
                for k, v in self.delex_dict.items():
                    response_ = response_.replace(k, v)  # delexicalize values

                # Times are only delexicalised when the response mentions a
                # train-related token.
                if any(token in response_ for token in train_tokens):
                    for k, v in self.train_time_dict.items():
                        response_ = response_.replace(k, v)  # delexicalize train times

                response_ = phone_pattern.sub("[phone]", response_)  # delexicalize phone number

                # replace [train_time] to [train_arrive] or [train_leave] by rule:
                # the nearest preceding cue word decides, defaulting to leave.
                # NOTE(review): list.index raises ValueError if the
                # placeholder is glued to other characters; this assumes it
                # is always whitespace-delimited -- confirm.
                while "[train_time]" in response_:
                    response_split = response_.split()
                    idx = response_split.index("[train_time]")
                    replacement = "[train_leave]"
                    for token in reversed(response_split[:idx]):
                        if token in arrive_cues:
                            replacement = "[train_arrive]"
                            break
                        if token in leave_cues:
                            break
                    response_split[idx] = replacement
                    response_ = " ".join(response_split)

                turn["response_delex"] = response_

                belief = {}
                gate = {}
                act = {}

                for domain in dial_domains:  # active domains of dialogue
                    for section in ("book", "semi"):
                        for slot, value in meta_data[domain][section].items():
                            if section == "book" and slot == "booked":
                                continue
                            slot, value = clean_slot_values(domain, slot, value)
                            if value == "":
                                continue
                            domain_slot = "{}-{}".format(domain, slot)
                            belief[domain_slot] = value
                            if value == "dontcare":
                                gate[domain_slot] = ontology.gate_dict[value]
                            else:
                                gate[domain_slot] = ontology.gate_dict["prediction"]
                turn["belief"] = belief
                turn["gate"] = gate

                # mapping system action; the acts are keyed by 1-based turn
                # number as a string, and string entries are skipped.
                turn_acts = acts.get(str(turn["turn_num"] + 1))
                if turn_acts and not isinstance(turn_acts, str):
                    for domain_act, slots in turn_acts.items():
                        act_domain = domain_act.split("-")[0]
                        act_temp = []
                        for slot in slots:  # slot: [slot, value]
                            slot_, value_ = clean_slot_values(act_domain, slot[0], slot[1])
                            if slot_ == "none" or value_ in ("?", "none"):
                                # general domain or request slot or parking
                                act_temp.append(slot_)
                            else:
                                act_temp.append("{}-{}".format(slot_, value_))
                        act[domain_act.lower()] = act_temp
                turn["action"] = act

                dialogue["log"].append(turn)
                turn = {}  # clear turn

            data[dial_id] = dialogue

        logger.info("Processing finished.")
        logger.info("Dividing data to train/dev/test...")
        splits = (
            (self.train_list, train),
            (self.dev_list, dev),
            (self.test_list, test),
        )
        for id_list, target in splits:
            for dial_id in id_list:
                dial_id = dial_id.split(".")[0]
                if dial_id not in ignored_ids:
                    target[dial_id] = data[dial_id]
        logger.info("Dividing finished.")

        # Context manager so the handle is closed deterministically.
        with open(os.path.join(self.data_path, "ontology.json"), "r") as f:
            value_ontology = json.load(f)
        value_ontology_processed = {}

        logger.info("Processing ontology...")
        for domain_slot, values in value_ontology.items():
            parts = domain_slot.split("-")
            domain = parts[0]
            slot = parts[2].lower()
            if ontology.normlize_slot_names.get(slot):
                slot = ontology.normlize_slot_names[slot]
            value_ontology_processed["-".join([domain, slot])] = [
                clean_slot_values(domain, slot, value)[1] for value in values
            ]
        # Written next to the input ontology via self.data_path; the bare
        # name `data_path` used previously is not defined in this method.
        with open(os.path.join(self.data_path, "ontology_processed.json"), "w") as f:
            json.dump(value_ontology_processed, f, indent=2)
        logger.info("Ontology was processed.")

        return train, dev, test

示例#5

0

显示文件

文件： app.py 项目： PrasadRaoJammuna/twitter_app

def sentiment():
    """Render a donut chart of tweet sentiment for a submitted search term.

    On POST, reads ``search_term`` and ``count`` from the form, fetches up
    to ``count`` tweets through ``tweepy.Cursor(api.search, ...)``, cleans
    each tweet with ``clean_text`` and classifies it by the sign of its
    TextBlob polarity. The positive/negative/neutral shares are drawn as a
    pie chart with a white inner circle and returned as a PNG.

    Returns:
        The ``send_file`` response carrying the PNG; a ``(message, 404)``
        tuple when the search yields no tweets; ``None`` for any request
        method other than POST.
    """
    if request.method != "POST":
        # NOTE(review): nothing is returned for other methods, as before;
        # confirm the route only accepts POST.
        return None

    search_term = request.form['search_term']
    count = int(request.form['count'])
    tweets = tweepy.Cursor(api.search, q=search_term).items(count)

    positive = 0
    negative = 0
    neutral = 0
    analysed = 0

    for data in tweets:
        polarity = TextBlob(clean_text(data.text)).sentiment.polarity
        analysed += 1

        if polarity == 0.00:
            neutral += 1
        elif polarity > 0.00:
            positive += 1
        elif polarity < 0.00:
            negative += 1

    if analysed == 0:
        # Avoids a division by zero and an all-zero pie chart. The search
        # term is deliberately not echoed back into the response body.
        return "No tweets were returned for this search.", 404

    # Shares are relative to the tweets actually fetched: the search may
    # return fewer than `count`, which previously skewed the percentages.
    positivity = 100.0 * positive / analysed
    negativity = 100.0 * negative / analysed
    neutrality = 100.0 * neutral / analysed

    labels = [
        'Positive[' + format(positivity, '.2f') + '%]',
        'Negative[' + format(negativity, '.2f') + '%]',
        'Neutral[' + format(neutrality, '.2f') + '%]',
    ]
    sizes = [positivity, negativity, neutrality]
    colors = ['green', 'red', 'orange']

    fig, ax = plt.subplots()
    try:
        ax.pie(
            sizes,
            colors=colors,
            labels=labels,
            autopct='%1.1f%%',
            startangle=90,
            pctdistance=0.85,
        )
        # A white disc over the centre turns the pie into a donut.
        ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))
        ax.axis('equal')
        ax.set_title('People reaction on ' + "'" + search_term.capitalize() +
                     "'" + ' by analyzing ' + str(analysed) + ' Tweets')
        fig.tight_layout()
        FigureCanvas(fig)  # attach the canvas used for rendering, as before
        img = BytesIO()
        fig.savefig(img)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each request leaks one figure.
        plt.close(fig)

    img.seek(0)
    return send_file(img, mimetype='image/png')