Exemplos de Preprocessor.basic_preprocess em Python

Linguagem de programação: Python

Namespace / nome do pacote: preprocess

Classe / Tipo: Preprocessor

Método / Função: basic_preprocess

Exemplos em hotexamples.com: 1

Preprocessor.basic_preprocess em Python - 1 exemplo encontrado. Este é o exemplo do mundo real mais bem avaliado de preprocess.Preprocessor.basic_preprocess em Python, extraído de projetos de código aberto. Você pode avaliar os exemplos para nos ajudar a melhorar a qualidade deles.

Métodos Frequentes

Exibir Ocultar

Preprocessor(30)

add(4)

execute(3)

load(3)

import_video(3)

get_vocabulary(2)

get_states(2)

get_standard_form(2)

get_representer(2)

gen_data_vec(2)

setNextPitchCorner(2)

count_lines(1)

bgsub(1)

load_data(1)

_line_cleanup(1)

lda(1)

investigate_whitelist(1)

index_list_to_word_list(1)

apply(1)

basic_preprocess(1)

get_values_all(1)

get_training_data(1)

get_train_test_data_tag(1)

get_testing_data(1)

get_target_names(1)

build_vocab(1)

convert_text_to_index(1)

build_vocabulary_and_categories(1)

get_feature_names(1)

get_data(1)

get_all_text(1)

_clean_data(1)

getSentences(1)

generateTrainData(1)

convert_index_to_text(1)

gaussian(1)

format_to_nn(1)

format_to_lines(1)

fit_on_corpus(1)

get_all_tag_idx(1)

Métodos Frequentes

Preprocessor (30)

add (4)

execute (3)

load (3)

import_video (3)

get_vocabulary (2)

get_states (2)

get_standard_form (2)

get_representer (2)

gen_data_vec (2)

Métodos Frequentes

setNextPitchCorner (2)

count_lines (1)

bgsub (1)

load_data (1)

_line_cleanup (1)

lda (1)

investigate_whitelist (1)

index_list_to_word_list (1)

apply (1)

basic_preprocess (1)

get_values_all (1)

get_training_data (1)

get_train_test_data_tag (1)

get_testing_data (1)

get_target_names (1)

build_vocab (1)

convert_text_to_index (1)

build_vocabulary_and_categories (1)

get_feature_names (1)

get_data (1)

Métodos Frequentes

get_values_all (1)

get_training_data (1)

get_train_test_data_tag (1)

get_testing_data (1)

get_target_names (1)

build_vocab (1)

convert_text_to_index (1)

build_vocabulary_and_categories (1)

get_feature_names (1)

get_data (1)

get_all_text (1)

_clean_data (1)

getSentences (1)

generateTrainData (1)

convert_index_to_text (1)

gaussian (1)

format_to_nn (1)

format_to_lines (1)

fit_on_corpus (1)

get_all_tag_idx (1)

Métodos Frequentes

get_all_text (1)

_clean_data (1)

getSentences (1)

generateTrainData (1)

convert_index_to_text (1)

gaussian (1)

format_to_nn (1)

format_to_lines (1)

fit_on_corpus (1)

get_all_tag_idx (1)

Exemplo n.º 1

0

Exibir arquivo

class SentimentAnalyzer():
    """Train and apply an NLTK-based sentiment classifier stored under a folder.

    Artifacts (classifier, features, classes, most common words) are read from
    and written to ``<dir>/training/model``. Training documents are read from
    ``<dir>/training/data`` and CSV files to classify from
    ``<dir>/testing/data``.

    NOTE(review): ``Preprocessor``, ``Document``, ``preprocess``,
    ``word_tokenize`` and ``nltk`` are expected to be in scope at module
    level; their imports are outside this block.
    """

    def __init__(self, dir, model):
        """Set up the analyzer and load any previously saved model artifacts.

        Args:
            dir: Root folder holding the ``training`` and ``testing`` trees.
            model: Name of the algorithm to use; only ``"NB"`` is handled by
                ``train``.
        """
        self.dir = dir
        self.model = model
        self.preprocessor = Preprocessor(dir)
        model_dir = f"{self.dir}/training/model"
        os.makedirs(model_dir, exist_ok=True)

        # Reuse artifacts from an earlier run so nothing has to be rebuilt.
        self.classifier = self._load_pickle(
            f"{model_dir}/{model}_clf.pickle", None)
        self.features = self._load_pickle(f"{model_dir}/features.pickle", [])
        self.classes = self._load_pickle(f"{model_dir}/classes.pickle", [])
        self.most_common_words = self._read_lines(
            f"{model_dir}/most_common_words.txt")

    @staticmethod
    def _load_pickle(path, default):
        """Return the object pickled at ``path``, or ``default`` if absent."""
        if not os.path.isfile(path):
            return default
        # SECURITY: pickle can execute arbitrary code while loading; only
        # point this at files produced by this program.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle ``obj`` to ``path`` and close the file so data is flushed."""
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)

    @staticmethod
    def _read_lines(path):
        """Return the stripped lines of a UTF-8 text file, or [] if absent."""
        if not os.path.isfile(path):
            return []
        with open(path, "r", encoding="utf8") as handle:
            return [line.strip() for line in handle]

    def _remove_most_common_words(self, documents):
        """Strip words that are in the top 1% most frequent of every class.

        The resulting word set is stored on ``self.most_common_words`` and
        written to ``most_common_words.txt`` in the model folder.

        Args:
            documents: Objects exposing ``name``, ``content``, ``sentiment``
                and ``location`` attributes.

        Returns:
            A list of new ``Document`` objects with those words removed.
        """
        logging.info("Define most common words...")
        # None marks "no class processed yet". Testing for an empty set
        # instead would restart the intersection whenever it became empty.
        common = None
        for cls in self.classes:
            doc_words = [
                w
                for d in documents if d.sentiment == cls
                for w in word_tokenize(d.content.replace(".", ""))
            ]
            fdist = nltk.FreqDist(doc_words)
            top_words = {
                w[0] for w in fdist.most_common(int(0.01 * len(doc_words)))
            }
            common = top_words if common is None else common & top_words
        most_common_words = set() if common is None else common

        self.most_common_words = most_common_words
        with open(f"{self.dir}/training/model/most_common_words.txt", "w",
                  encoding="utf8") as writer:
            writer.writelines([f"{w}\n" for w in self.most_common_words])

        ndocs = []
        total = len(documents)
        for doc_count, d in enumerate(documents, start=1):
            ncontent = " ".join([
                w for w in word_tokenize(d.content)
                if w not in most_common_words
            ])
            ndocs.append(Document(d.name, ncontent, d.sentiment, d.location))
            logging.info("Removing most common words progress %d %%",
                         int(doc_count / total * 100))
        return ndocs

    def _reduce_dimension_by_postag(self, documents):
        """Keep only adjectives, adverbs and nouns in each document.

        A token is kept when its POS tag is in ``preprocess.ADJ``,
        ``preprocess.ADV`` or ``preprocess.NOUN``. Documents left with no
        content are dropped.

        Returns:
            A list of new ``Document`` objects with the reduced content.
        """
        reduced_documents = []
        total = len(documents)
        for doc_count, doc in enumerate(documents, start=1):
            tagged = nltk.pos_tag(word_tokenize(doc.content.replace(".", "")))
            reduced_sentence = " ".join([
                p[0] for p in tagged
                if p[1] in preprocess.ADJ
                or p[1] in preprocess.ADV
                or p[1] in preprocess.NOUN
            ])
            # "".isspace() is False, so an isspace() test lets empty
            # sentences through; strip() catches empty and blank alike.
            if reduced_sentence.strip():
                reduced_documents.append(
                    Document(doc.name, reduced_sentence, doc.sentiment,
                             doc.location))
            logging.info("Reducing dimension in progress %d %%",
                         int(doc_count * 100 / total))
        return reduced_documents

    def create_frequency_plot(self, words, top_k):
        """Plot the ``top_k`` most frequent entries of ``words``."""
        p = nltk.FreqDist(words)
        p.plot(top_k)

    def _undersample(self, documents):
        """Balance classes by sampling each down to the smallest class size.

        Returns:
            A list with the same number of randomly chosen documents for
            every class in ``self.classes``.
        """
        # Find the minimum number of documents in a class.
        docs_by_class = [
            [d for d in documents if d.sentiment == cls]
            for cls in self.classes
        ]
        minclass_length = min(
            [len(documents)] + [len(docs) for docs in docs_by_class])

        # Sample all classes based on that minimum.
        undersampled_docs = []
        for docs in docs_by_class:
            random.shuffle(docs)
            undersampled_docs.extend(docs[:minclass_length])
        return undersampled_docs

    def prepare_documents(self):
        """Load every pickled training file and run the preprocessing steps.

        If ``self.classes`` is empty it is derived from the loaded documents
        and saved. The documents are then undersampled, reduced by POS tag
        and stripped of the most common words, in that order.

        Returns:
            The list of preprocessed documents.
        """
        documents = []
        data_dir = f"{self.dir}/training/data"
        for file in os.listdir(data_dir):
            # SECURITY: pickle.load runs arbitrary code from the file; the
            # training data folder must only contain trusted files.
            with open(f"{data_dir}/{file}", "rb") as handle:
                documents.extend(pickle.load(handle))

        if not self.classes:
            self.classes = set([doc.sentiment for doc in documents])
            self._dump_pickle(
                self.classes, f"{self.dir}/training/model/classes.pickle")

        logging.info("Perform undersampling...")
        documents = self._undersample(documents)
        documents = self._reduce_dimension_by_postag(documents)
        documents = self._remove_most_common_words(documents)
        return documents

    def transform_into_featuresets(self, documents):
        """Build ``(features, sentiment)`` pairs from ``documents``.

        ``self.features`` becomes the set of every token found in the
        documents and is saved to ``features.pickle``.

        Returns:
            A list of ``({word: True, ...}, sentiment)`` tuples, one per
            document.
        """
        self.features = set(
            [w for d in documents for w in set(word_tokenize(d.content))])
        self._dump_pickle(
            self.features, f"{self.dir}/training/model/features.pickle")
        logging.info("Features length: %d", len(self.features))

        featuresets = []
        logging.info("Transforming into featuresets....")
        total = len(documents)
        for doc_count, doc in enumerate(documents, start=1):
            featuresets.append(({
                w: True
                for w in word_tokenize(doc.content) if w in self.features
            }, doc.sentiment))
            logging.info("Preparing featureset in progress %d %%",
                         int(doc_count * 100 / total))
        return featuresets

    def get_training_validation_set(self, featuresets, valid_ratio):
        """Split ``featuresets`` per class into training and validation parts.

        Args:
            featuresets: Sequence of ``(features, label)`` pairs.
            valid_ratio: Fraction of each class put in the validation set.

        Returns:
            A ``(trainingset, validset)`` tuple of lists.
        """
        if not self.classes:
            # Store on the instance: a local name here would leave
            # self.classes empty, so the loop below would return nothing.
            self.classes = set([f[1] for f in featuresets])
            self._dump_pickle(
                self.classes, f"{self.dir}/training/model/classes.pickle")

        trainingset = []
        validset = []
        for c in self.classes:
            subfeat = [f for f in featuresets if f[1] == c]
            random.shuffle(subfeat)
            trainct = int((1 - valid_ratio) * len(subfeat))
            trainingset.extend(subfeat[:trainct])
            validset.extend(subfeat[trainct:])
        return trainingset, validset

    def train(self, validation_ratio):
        """Train the classifier, log its accuracy and save it to disk.

        Args:
            validation_ratio: Fraction of each class held out for validation.

        Raises:
            SystemExit: If ``self.model`` is not ``"NB"``.
        """
        os.makedirs(os.path.dirname(f"{self.dir}/training/model/"),
                    exist_ok=True)
        documents = self.prepare_documents()
        featuresets = self.transform_into_featuresets(documents)
        trainset, validset = self.get_training_validation_set(
            featuresets, validation_ratio)

        logging.info("Building classifier...")
        # Only Naive Bayes is wired up. Further models (for example
        # scikit-learn estimators wrapped by nltk's SklearnClassifier) could
        # be added as extra branches here.
        if self.model == "NB":
            self.classifier = nltk.NaiveBayesClassifier.train(trainset)
            self.classifier.show_most_informative_features(15)
        else:
            logging.error("Model does not exist")
            sys.exit()

        logging.info("Accuracy per class")
        for cls in self.classes:
            class_validset = [v for v in validset if v[1] == cls]
            logging.info(
                "%s accuracy: %s", cls,
                nltk.classify.accuracy(self.classifier, class_validset) * 100)
        logging.info("Classifier accuracy percent: %s",
                     nltk.classify.accuracy(self.classifier, validset) * 100)

        self._dump_pickle(
            self.classifier,
            f"{self.dir}/training/model/{self.model}_clf.pickle")

    def show_most_informative_features(self, n):
        """Print the classifier's ``n`` most informative features."""
        self.classifier.show_most_informative_features(n)

    def sentiment(self, text):
        """Classify ``text`` and return ``(label, probability)``.

        The text goes through ``Preprocessor.basic_preprocess`` and has its
        periods removed before tokens are matched against ``self.features``.
        """
        cleaned_text = self.preprocessor.basic_preprocess(text).replace(
            ".", "")
        # No further reduction is needed: tokens outside self.features are
        # simply ignored.
        feature = {
            w: True
            for w in word_tokenize(cleaned_text) if w in self.features
        }
        prob_dict = self.classifier.prob_classify(feature)
        cls = prob_dict.max()
        prob = prob_dict.prob(cls)
        return cls, prob

    def classify(self):
        """Classify every pending CSV in ``testing/data``.

        Trains a classifier first (20% validation split) if none is loaded.
        For each input file not yet listed in ``testing/classify_done.txt`` a
        CSV of the same name is written to ``testing/results``, and the file
        name is then appended to ``classify_done.txt``.
        """
        logging.info("Start classifying...")
        os.makedirs(f"{self.dir}/testing/results", exist_ok=True)
        if self.classifier is None:
            self.train(0.2)
        else:
            self.classifier.show_most_informative_features(15)

        files = [
            os.path.basename(x)
            for x in glob.glob(f"{self.dir}/testing/data/*.csv")
        ]
        # Read with the same encoding used to append to the file below.
        done_files = set(
            self._read_lines(f"{self.dir}/testing/classify_done.txt"))
        tbp_files = [f for f in files if f not in done_files]
        headers = [
            "review_page", "review_title", "review_content", "review_star",
            "reviewer_location", "review_date", "crawled_date"
        ]

        for file in tbp_files:
            source_path = f"{self.dir}/testing/data/{file}"
            result_path = f"{self.dir}/testing/results/{file}"
            # newline="" lets the csv module handle line endings itself.
            with open(source_path, "r", encoding="utf8", newline="") as f, \
                    open(result_path, "w", encoding="utf8", newline="") as w:
                csvreader = csv.DictReader(f)
                csvwriter = csv.writer(w)
                csvwriter.writerow(headers)
                # Count lines of the file being read (under "testing") and
                # never divide by zero in the progress computation.
                rownum = max(self.preprocessor.count_lines(source_path), 1)
                for rowid, row in enumerate(csvreader, start=1):
                    cat = self.sentiment(
                        f"{row['review_title']}. {row['review_content']}")
                    # review_star is filled from the predicted class.
                    review_star = "45" if cat[0] == "pos" else "20"
                    csvwriter.writerow([
                        row["review_page"],
                        row["review_title"],
                        row["review_content"],
                        review_star,
                        row["user_location"],
                        row["review_date"],
                        "00000000",  # crawled_date is written as a constant
                    ])
                    w.flush()
                    logging.info("Classifying in progress %d %% for %s",
                                 int(rowid * 100 / rownum), file)
            with open(f"{self.dir}/testing/classify_done.txt", "a",
                      encoding="utf8") as writer:
                writer.write(f"{file}\n")