class SentimentDictionaryCreator:
    def __init__(self, ru_wordnet_affect_positive_files,
                 ru_wordnet_affect_negative_files, rusentilex_file):
        self.ru_wordnet_affect_positive_files = ru_wordnet_affect_positive_files
        self.ru_wordnet_affect_negative_files = ru_wordnet_affect_negative_files
        self.rusentilex_file = rusentilex_file
        self.sentiment_docs = []
        self.mongo = MongoDb("")
        self.morph = MorphAnalyzer()
        self.already_in_docs = set()

    def create(self):
        self.__process_ru_wordnet_affect()
        self.__process_rusentilex()

    def __process_ru_wordnet_affect(self):
        for file_name in self.ru_wordnet_affect_positive_files:
            self.__process_file_with_mark(file_name, 'positive')
        for file_name in self.ru_wordnet_affect_negative_files:
            self.__process_file_with_mark(file_name, 'negative')

    def __process_file_with_mark(self, file_name, mark):
        with open(file_name, 'r', encoding='utf-8-sig') as file:
            for line in file:
                if '\t' not in line:
                    continue
                # The fourth tab-separated column holds the Russian words of the synset.
                rus_words = line.split('\t')[3]
                for word in rus_words.split():
                    morphed = self.morph.parse(word)[0]
                    pos = morphed.tag.POS
                    # Skip infinitives and confidently-parsed nouns; only
                    # adjective-like sentiment words are kept.
                    if pos == 'INFN' or (pos == 'NOUN' and morphed.score >= 0.5):
                        continue
                    word = word.replace('_', ' ')
                    if word not in self.already_in_docs:
                        self.sentiment_docs.append({'word': word, 'polarity': mark})
                        self.already_in_docs.add(word)

    def __process_rusentilex(self):
        with open(self.rusentilex_file, 'r', encoding='utf-8-sig') as file:
            # The first 18 lines of the RuSentiLex dump are a header.
            for i in range(18):
                file.readline()
            for line in file:
                words = [w.strip() for w in line.split(',')]
                pos = words[1]
                lemma = words[2]
                polarity = words[3]
                source = words[4]
                # Keep only opinion-bearing adjectives not seen before.
                if pos == 'Adj' and source == 'opinion' and lemma not in self.already_in_docs:
                    self.sentiment_docs.append({'word': lemma, 'polarity': polarity})
                    self.already_in_docs.add(lemma)

    def write_sentiment_dictionary(self):
        self.mongo.write_sentiment_dictionary(self.sentiment_docs)
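# A minimal usage sketch, assuming local dumps of RuWordNet-Affect (one file
# per emotion) and RuSentiLex; all file names below are hypothetical.
creator = SentimentDictionaryCreator(
    ru_wordnet_affect_positive_files=['joy.txt'],
    ru_wordnet_affect_negative_files=['anger.txt', 'fear.txt', 'sadness.txt'],
    rusentilex_file='rusentilex.txt')
creator.create()
creator.write_sentiment_dictionary()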
def connMongoDb():
    global cf  # cf is a configparser instance populated elsewhere
    username = cf.get("MONGO", 'username')
    password = cf.get("MONGO", 'password')
    if username != '' and password != '':
        # Authenticate against the admin database when credentials are configured.
        conn = mongo.connect(cf.get("MONGO", "ip"), int(cf.get("MONGO", "port")))
        mongo_auth = conn.admin
        mongo_auth.authenticate(username, password)
        return conn
    return mongo.connect(cf.get("MONGO", "ip"), int(cf.get("MONGO", "port")))
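# A minimal call-site sketch for connMongoDb(); the file name 'config.ini' is
# hypothetical, and the [MONGO] section must provide ip, port, username and
# password (leave the last two empty for an unauthenticated connection):
#
#   [MONGO]
#   ip = 127.0.0.1
#   port = 27017
#   username =
#   password =
#
import configparser

cf = configparser.ConfigParser()
cf.read('config.ini')
conn = connMongoDb()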
class AspectsExtractor:
    def __init__(self, application_name, config_file_name):
        self.__parse_config(config_file_name, application_name)
        self.mongo = MongoDb(self.collection_name_prefix)
        self.transaction_docs = []
        self.aspect_docs = []

    def __parse_config(self, config_file_name, application_name):
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.collection_name_prefix = self.config[application_name]['collection_name_prefix']
        self.application_label = self.config[application_name]['application_label']

    def load_transactions(self):
        self.transaction_docs = self.mongo.load_transactions()

    def extract(self):
        transactions_for_te = []
        for doc in self.transaction_docs:
            transaction = doc["transaction"]
            transactions_for_te.append(transaction.split())
        # One-hot encode the transactions into a sparse DataFrame for apriori.
        te = TransactionEncoder()
        oht_ary = te.fit(transactions_for_te).transform(transactions_for_te, sparse=True)
        sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
        if self.__settings_for_app_exist():
            minsup = self.__get_minsup_fraction()
        else:
            # No stored settings yet: fall back to 10% support and persist the
            # defaults (presumably minsup = 10% and sentiment window radius = 2).
            minsup = 0.1
            self.mongo.save_new_settings(self.application_label,
                                         self.collection_name_prefix, 10, 2)
        # Mine frequent single items (max_len=1): each one is an aspect candidate.
        df_aspects = apriori(sparse_df, min_support=minsup, use_colnames=True, max_len=1)
        self.aspect_docs = []
        for i in df_aspects.index:
            aspect = ' '.join(list(df_aspects.loc[i, 'itemsets']))
            self.aspect_docs.append({
                'aspect': aspect,
                'support': df_aspects.loc[i, 'support']
            })

    def __settings_for_app_exist(self):
        return self.mongo.find_settings(self.application_label) != 0

    def __get_minsup_fraction(self):
        doc = self.mongo.load_minimum_support(self.application_label)
        return float(doc['minsup']) / 100

    def write_aspects(self):
        self.mongo.write_aspects(self.aspect_docs)
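# A standalone toy run of the frequent-itemset step used in extract(), so the
# min_support mechanics can be checked without MongoDB; the transactions are
# made up.
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

transactions = [['приложение', 'батарея'],
                ['приложение', 'реклама'],
                ['приложение'],
                ['реклама']]
te = TransactionEncoder()
df = pd.DataFrame(te.fit(transactions).transform(transactions), columns=te.columns_)
# max_len=1 keeps single-word itemsets only, mirroring AspectsExtractor.
print(apriori(df, min_support=0.5, use_colnames=True, max_len=1))
# 'приложение' (3/4) and 'реклама' (2/4) survive min_support=0.5; 'батарея' (1/4) does not.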
def main():
    # Usage: <img_path> <floorplan_id> <pixels_per_foot> <save_file_path>
    img_path = sys.argv[1]
    floorplan_id = sys.argv[2]
    pixels_per_foot = int(sys.argv[3])
    save_file_path = sys.argv[4]
    data = DataGridMaker.DataGrid(img_path, pixels_per_foot)
    MongoDb.insert_dimensions(floorplan_id, data.width, data.height)
    # Write the occupancy grid to file: ' ' for free cells, 'x' for blocked ones.
    with open(save_file_path, 'w') as f:
        for row in data.map:
            for cell in row:
                f.write(" " if cell else "x")
            f.write('\n')
    return 0
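# Hypothetical invocation of the entry point above (the script name and
# argument values are illustrative):
#
#   python make_grid.py floorplan.png 42 12 grid.txt
#
if __name__ == '__main__':
    sys.exit(main())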
class SentimentAnalyzer:
    def __init__(self, application_name, config_file_name):
        self.__parse_config(config_file_name, application_name)
        self.mongo = MongoDb(self.collection_name_prefix)
        self.review_docs = []
        self.sentiment_dictionary = dict()
        self.modified_review_docs = []
        self.aspects_statistics = AspectsStatistics()

    def __parse_config(self, config_file_name, application_name):
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.collection_name_prefix = self.config[application_name]['collection_name_prefix']

    def load_reviews(self):
        self.review_docs = self.mongo.load_reviews()

    def load_sentiment_dictionary_docs(self):
        sentiment_dictionary_docs = self.mongo.load_sentiment_dictionary()
        for doc in sentiment_dictionary_docs:
            self.sentiment_dictionary[doc['word']] = doc['polarity']

    def analyze(self):
        known_sentiments = set(self.sentiment_dictionary.keys())
        self.aspects_statistics = AspectsStatistics()
        for review_doc in self.review_docs:
            # Each (aspect, sentiment) pair is counted at most once per review.
            already_counted_opinion = set()
            if 'opinions' not in review_doc:
                continue
            for opinion_obj in review_doc['opinions']:
                if 'sentiment' not in opinion_obj:
                    continue
                aspect = opinion_obj['aspect']
                sentiment = opinion_obj['sentiment']
                if sentiment in known_sentiments:
                    polarity = self.sentiment_dictionary[sentiment]
                    opinion_obj['polarity'] = polarity
                    if aspect + sentiment not in already_counted_opinion:
                        self.aspects_statistics.consider(aspect, polarity)
                        already_counted_opinion.add(aspect + sentiment)
                else:
                    if aspect + sentiment not in already_counted_opinion:
                        self.aspects_statistics.consider(aspect, "unknown")
                        already_counted_opinion.add(aspect + sentiment)
            self.modified_review_docs.append(review_doc)

    def write(self):
        self.mongo.write_polarities(self.modified_review_docs)
        self.mongo.write_aspects_statistics(self.aspects_statistics.get_documents())
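# Shape of a review document before and after analyze(), as implied by the
# code above; the '_id', text and concrete words are illustrative placeholders.
review_before = {
    '_id': 'some-object-id',
    'text': 'review text',
    'opinions': [{'aspect': 'приложение', 'sentiment': 'хороший'}]
}
# After analyze(), each opinion whose sentiment word occurs in the dictionary
# gains a 'polarity' field; sentiment words missing from the dictionary are
# counted into the aspect statistics under "unknown" instead.
review_after = {
    '_id': 'some-object-id',
    'text': 'review text',
    'opinions': [{'aspect': 'приложение', 'sentiment': 'хороший',
                  'polarity': 'positive'}]
}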
class GooglePlayScraper:
    def __init__(self, application_name, config_file_name):
        self.page_to_parse_index = 0
        self.__parse_config(config_file_name, application_name)
        self.__create_driver()
        self.mongo = MongoDb(self.collection_name_prefix)

    def __parse_config(self, config_file_name, application_name):
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.url_to_scrap = self.config[application_name]['url_to_scrap']
        self.collection_name_prefix = self.config[application_name]['collection_name_prefix']
        self.sleep_in_seconds = int(self.config['Scraper']['sleep_in_seconds'])
        self.page_to_scrap_count = int(self.config['Scraper']['page_to_scrap_count'])

    def __create_driver(self):
        chrome_options = Options()
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=chrome_options)

    def skip_first_pages(self, pages_count):
        for i in range(pages_count):
            self.page_to_parse_index += 1
            self.__delete_reviews()
            self.__trigger_next_page()
            if self.__check_button_existence():
                self.__click_button()
            time.sleep(self.sleep_in_seconds)

    def __delete_reviews(self):
        # Drop already-processed review nodes so the page stays small.
        self.driver.execute_script(
            'var root = document.querySelector("[jsname=fk8dgd]"); \
             while (root.firstChild) { \
                 root.removeChild(root.firstChild); \
             }'
        )

    def __trigger_next_page(self):
        # Nudge upwards, then smooth-scroll to the bottom to trigger lazy loading.
        self.driver.execute_script('window.scrollBy(0, -20)')
        self.driver.execute_script(
            'window.scrollTo({top: document.body.scrollHeight, behavior: "smooth"})'
        )

    def __check_button_existence(self):
        # 'Показать ещё' is the Russian-locale "Show more" button.
        try:
            self.driver.find_element_by_xpath("//span[contains(./text(), 'Показать ещё')]/../..")
        except NoSuchElementException:
            return False
        return True

    def __click_button(self):
        btn = self.driver.find_element_by_xpath("//span[contains(./text(), 'Показать ещё')]/../..")
        self.driver.execute_script("arguments[0].click();", btn)

    def scrap(self):
        self.driver.get(self.url_to_scrap)
        for i in range(self.page_to_scrap_count):
            self.page_to_parse_index += 1
            reviews = self.__get_reviews_containers()
            # Google Play serves reviews in batches of 40.
            if len(reviews) == 40:
                print("OK: {0} out of 40".format(len(reviews)))
            else:
                print("WARN: {0} out of 40".format(len(reviews)))
            reviews_data = self.__get_reviews_data(reviews)
            self.mongo.write_reviews(reviews_data)
            self.__delete_reviews()
            self.__trigger_next_page()
            if self.__check_button_existence():
                self.__click_button()
            time.sleep(self.sleep_in_seconds)
        self.driver.quit()

    def __get_reviews_containers(self):
        try:
            reviews = self.driver.find_elements_by_xpath("//div[contains(@jsname, 'fk8dgd')]/div")
        except Exception as e:
            print("[ERROR] failed to fetch reviews from the page: {0}\n".format(str(e)))
            reviews = []
        return reviews

    def __get_reviews_data(self, reviews):
        reviews_data = []
        for review in reviews:
            name = review.find_element_by_xpath(".//span[contains(@class, 'X43Kjb')]").text
            # The rating number is the third whitespace-separated token of the aria-label.
            score = int(review.find_element_by_xpath(
                ".//div[contains(@role, 'img')]"
            ).get_attribute("aria-label").split()[2])
            likes = self.__get_likes(review)
            date = self.__get_date(review)
            # The default avatar is photo.jpg; any other src means a real photo.
            has_photo = '/photo.jpg' not in review.find_element_by_xpath(
                ".//img[contains(@class, 'T75of ZqMJr')]"
            ).get_attribute("src")
            text = self.__get_text(review)
            reviews_data.append({
                'name': name,
                'score': score,
                'date': date,
                'likes': likes,
                'has_photo': has_photo,
                'text': text
            })
        return reviews_data

    def __get_likes(self, review):
        likes = review.find_element_by_xpath(".//div[contains(@class, 'jUL89d y92BAb')]").text
        return int(likes) if likes != '' else 0

    def __get_date(self, review):
        # Russian dates look like "12 марта 2020 г."; convert to YYYY-MM-DD.
        date_splitted_str = review.find_element_by_xpath(
            ".//span[contains(@class, 'p2TkOb')]").text.split()
        day = date_splitted_str[0]
        year = date_splitted_str[2]
        month_str = date_splitted_str[1]
        month_str_to_num = {'января': '01', 'февраля': '02', 'марта': '03',
                            'апреля': '04', 'мая': '05', 'июня': '06',
                            'июля': '07', 'августа': '08', 'сентября': '09',
                            'октября': '10', 'ноября': '11', 'декабря': '12'}
        month = month_str_to_num[month_str]
        return year + '-' + month + '-' + day

    def __get_text(self, review):
        # Prefer the full review text; fall back to the truncated preview.
        text = review.find_element_by_xpath(
            ".//span[contains(@jsname, 'fbQN7e')]").get_attribute('textContent')
        if text == '':
            text = review.find_element_by_xpath(".//span[contains(@jsname, 'bN97Pc')]").text
        return text
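# A minimal scraping run; 'MyApp' and 'config.ini' are hypothetical, and the
# config must define url_to_scrap and collection_name_prefix for the app plus
# the [Scraper] options sleep_in_seconds and page_to_scrap_count.
scraper = GooglePlayScraper('MyApp', 'config.ini')
scraper.scrap()  # scrapes page_to_scrap_count batches and writes them to MongoDB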
class TransactionsManager:
    def __init__(self, application_name, config_file_name):
        self.__parse_config(config_file_name, application_name)
        self.mongo = MongoDb(self.collection_name_prefix)
        self.transaction_docs = []
        self.review_docs = []

    def __parse_config(self, config_file_name, application_name):
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.collection_name_prefix = self.config[application_name]['collection_name_prefix']

    def load_reviews(self):
        for e in self.mongo.load_reviews():
            self.review_docs.append(e)

    def create_transactions(self):
        morph = pymorphy2.MorphAnalyzer()
        self.transaction_docs = []
        for doc in self.review_docs:
            transaction = []
            id = doc['_id']
            text = doc['text']
            # Keep only Cyrillic letters and digits ('Ё' included so it
            # survives until the subsequent lower()), then keep noun lemmas.
            clear_text = re.sub('[^А-Яа-яёЁ0-9]+', ' ', text).lower()
            for word in clear_text.split():
                morphed = morph.parse(word)[0]
                if morphed.tag.POS == 'NOUN':
                    transaction.append(morphed.normal_form)
            self.transaction_docs.append({
                '_id': id,
                'transaction': ' '.join(transaction)
            })

    def write_transactions(self):
        self.mongo.write_transactions(self.transaction_docs)

    def create_transactions_advanced(self):
        morph = pymorphy2.MorphAnalyzer()
        self.transaction_docs = []
        for doc in self.review_docs:
            transaction = []
            id = doc['_id']
            text = doc['text']
            clear_text = re.sub('[^А-Яа-яёЁ0-9]+', ' ', text).lower()
            for sentence in nltk.sent_tokenize(clear_text, language='russian'):
                bigrams = list(
                    nltk.bigrams(
                        nltk.word_tokenize(sentence.lower(), language='russian')))
                for first, second in bigrams:
                    first_morphed = morph.parse(first)[0]
                    second_morphed = morph.parse(second)[0]
                    morphed = [first_morphed, second_morphed]
                    if TransactionsManager.__is_noun_phrase(morphed):
                        # Inflect the adjective to singular, agreeing with the
                        # noun's gender; fall back to its normal form when
                        # inflection is impossible.
                        gender = morphed[1].tag.gender
                        inflected = morphed[0].inflect({'sing', gender}) if gender else None
                        adj = inflected.word if inflected else morphed[0].normal_form
                        transaction.append(' '.join([adj, morphed[1].word]))
                    elif first_morphed.tag.POS == 'NOUN':
                        transaction.append(first_morphed.normal_form)
                # The last bigram's second word never appears as a `first`,
                # so handle a trailing noun separately.
                if len(bigrams) > 0:
                    last_morphed = morph.parse(bigrams[-1][1])[0]
                    if last_morphed.tag.POS == 'NOUN':
                        transaction.append(last_morphed.normal_form)
            self.transaction_docs.append({
                "_id": id,
                "transaction": ';'.join(set(transaction))
            })

    @staticmethod
    def __is_noun_phrase(words):
        return words[0].tag.POS == 'ADJF' and words[1].tag.POS == 'NOUN' \
            and words[0].tag.case == words[1].tag.case \
            and words[0].tag.gender == words[1].tag.gender
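# A sketch of what the two builders produce for one review; the review text
# and the pymorphy2 parses are illustrative, and set order is arbitrary.
#
#   text: "Очень хорошее приложение, но батарея садится быстро"
#   create_transactions:          'приложение батарея'
#   create_transactions_advanced: 'хорошее приложение;приложение;батарея'
#
# Note the basic builder's space-joined output matches the transaction.split()
# in AspectsExtractor.extract(); the advanced ';'-joined, multiword output
# would need a split(';') there instead.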
class OpinionsExtractor:
    def __init__(self, application_name, config_file_name):
        self.__parse_config(config_file_name, application_name)
        self.mongo = MongoDb(self.collection_name_prefix)
        self.morph = pymorphy2.MorphAnalyzer()
        self.aspects = set()
        self.id_to_opinions = dict()
        self.review_docs = []

    def __parse_config(self, config_file_name, application_name):
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.collection_name_prefix = self.config[application_name]['collection_name_prefix']
        self.application_label = self.config[application_name]['application_label']

    def load_aspects(self):
        aspect_docs = self.mongo.load_aspects()
        for doc in aspect_docs:
            self.aspects.add(doc["aspect"])

    def load_reviews(self):
        self.review_docs = self.mongo.load_reviews()

    def extract(self):
        window_radius = self.__get_window_radius()
        # Keep 'не' so that negations survive stopword filtering.
        stopwords = set(nltk.corpus.stopwords.words('russian'))
        stopwords.remove('не')
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        self.id_to_opinions = dict()
        for doc in self.review_docs:
            id = doc["_id"]
            text = doc["text"]
            for sentence in nltk.sent_tokenize(text.lower(), language='russian'):
                words = tokenizer.tokenize(sentence)
                filtered_words = [w for w in words if w not in stopwords]
                # For every noun that is a known aspect, look for a sentiment
                # word within the configured window around it.
                # (__process_sentence below is an alternative, richer variant.)
                for j in range(len(filtered_words)):
                    morphed = self.morph.parse(filtered_words[j])[0]
                    if morphed.tag.POS == 'NOUN' and morphed.normal_form in self.aspects:
                        self.__scan_window_for_sentiment(id, filtered_words, j, window_radius)

    def __get_window_radius(self):
        doc = self.mongo.load_window_radius(self.application_label)
        return int(doc['window'])

    def __process_sentence(self, id, words):
        morphed_adjective_pos_inverse_flag_tuples = []
        morphed_aspect_pos_pairs = []
        for i in range(len(words)):
            word = words[i]
            for morphed in self.morph.parse(word)[:2]:
                if morphed.tag.POS == 'ADJF':
                    is_inversed = self.__is_inversed_polarity(words, i)
                    morphed_adjective_pos_inverse_flag_tuples.append((morphed, i, is_inversed))
                    break
                elif morphed.tag.POS == 'NOUN' and morphed.normal_form in self.aspects:
                    morphed_aspect_pos_pairs.append((morphed, i))
                    break
        sentiment_words_and_inverse_flag = []
        for (morphed, _, is_inversed) in morphed_adjective_pos_inverse_flag_tuples:
            if is_inversed:
                sentiment_words_and_inverse_flag.append({
                    'word': morphed.normal_form,
                    'is_inversed': is_inversed
                })
            else:
                sentiment_words_and_inverse_flag.append({'word': morphed.normal_form})
        adjectives_normal_forms_length = len(sentiment_words_and_inverse_flag)
        window_radius = self.__get_window_radius()
        for (morphed_aspect, aspect_pos) in morphed_aspect_pos_pairs:
            left_pos = max(0, aspect_pos - window_radius)
            right_pos = min(aspect_pos + window_radius, len(words) - 1)
            min_dist = 9999
            effective_sentiment = ''
            # Adjective positions are ascending, so we can skip left of the
            # window and break once past its right edge.
            for (morphed_adjective, adjective_pos, _) in morphed_adjective_pos_inverse_flag_tuples:
                if adjective_pos < left_pos:
                    continue
                elif adjective_pos > right_pos:
                    break
                else:
                    dist = abs(aspect_pos - adjective_pos)
                    if self.__are_adjective_and_aspect_match(
                            morphed_adjective, morphed_aspect, dist, min_dist):
                        min_dist = dist
                        effective_sentiment = morphed_adjective.normal_form
            doc = {'aspect': morphed_aspect.normal_form}
            if effective_sentiment != '':
                doc['effective_sentiment'] = effective_sentiment
            if adjectives_normal_forms_length != 0:
                doc['sentiment_words'] = sentiment_words_and_inverse_flag
            if id in self.id_to_opinions:
                self.id_to_opinions[id].append(doc)
            else:
                self.id_to_opinions[id] = [doc]

    def __is_inversed_polarity(self, words, adj_pos):
        if adj_pos == 0:
            return False
        return words[adj_pos - 1] == 'не'

    def __are_adjective_and_aspect_match(self, morphed_adjective, morphed_aspect, dist, min_dist):
        # A full adjective must agree with the aspect in case; short
        # adjectives carry no case and are accepted as-is.
        return (morphed_adjective.tag.POS == 'ADJF'
                and morphed_adjective.tag.case == morphed_aspect.tag.case
                or morphed_adjective.tag.POS == 'ADJS') and dist < min_dist

    def __scan_window_for_sentiment(self, id, sentence_words, aspect_position, window_radius):
        left_i = max(0, aspect_position - window_radius)
        right_i = min(aspect_position + window_radius, len(sentence_words) - 1)
        min_distance = 1000
        sentiment = ''
        aspect_morphed = self.morph.parse(sentence_words[aspect_position])[0]
        for i in range(left_i, right_i + 1):
            if i == aspect_position:
                continue
            for morphed in self.morph.parse(sentence_words[i])[:2]:
                if self.__are_words_consistent(morphed, aspect_morphed, i,
                                               aspect_position, min_distance):
                    min_distance = abs(i - aspect_position)
                    sentiment = morphed.normal_form
                    break
        doc = {'aspect': aspect_morphed.normal_form}
        if sentiment != '':
            doc['sentiment'] = sentiment
        if id in self.id_to_opinions:
            self.id_to_opinions[id].append(doc)
        else:
            self.id_to_opinions[id] = [doc]

    def __are_words_consistent_1(self, morphed, aspect_morphed, word_position,
                                 aspect_position, min_distance):
        # Earlier, case-only variant of the consistency check, kept for reference.
        return (morphed.tag.POS == 'ADJF'
                and morphed.tag.case == aspect_morphed.tag.case
                or morphed.tag.POS == 'ADJS') \
            and abs(word_position - aspect_position) < min_distance

    def __are_words_consistent(self, morphed, aspect_morphed, word_position,
                               aspect_position, min_distance):
        # Use the absolute distance: a candidate to the left of the aspect has
        # a negative offset, which the original unsigned comparison missed.
        if abs(word_position - aspect_position) >= min_distance:
            return False
        pos = morphed.tag.POS
        # Adjectives and participles must agree with the aspect noun in at
        # least two of case, number and gender.
        if pos == 'ADJF' or pos == 'ADJS' or pos == 'PRTF':
            score = 0
            if morphed.tag.case == aspect_morphed.tag.case:
                score += 1
            if morphed.tag.number == aspect_morphed.tag.number:
                score += 1
            if morphed.tag.gender == aspect_morphed.tag.gender:
                score += 1
            return score >= 2
        return False

    def write_opinions(self):
        self.mongo.write_opinions(self.id_to_opinions)
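# How the pieces chain together once reviews are in MongoDB: a minimal
# end-to-end sketch, assuming every class reads the same hypothetical 'MyApp'
# section of 'config.ini' and the sentiment dictionary has been written.
if __name__ == '__main__':
    manager = TransactionsManager('MyApp', 'config.ini')
    manager.load_reviews()
    manager.create_transactions()
    manager.write_transactions()

    aspects = AspectsExtractor('MyApp', 'config.ini')
    aspects.load_transactions()
    aspects.extract()
    aspects.write_aspects()

    opinions = OpinionsExtractor('MyApp', 'config.ini')
    opinions.load_aspects()
    opinions.load_reviews()
    opinions.extract()
    opinions.write_opinions()

    analyzer = SentimentAnalyzer('MyApp', 'config.ini')
    analyzer.load_reviews()
    analyzer.load_sentiment_dictionary_docs()
    analyzer.analyze()
    analyzer.write()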