Пример #1
0
 def __init__(self, application_name, config_file_name):
     """Read the app config, connect storage, and prepare empty state."""
     self.__parse_config(config_file_name, application_name)
     self.mongo = MongoDb(self.collection_name_prefix)
     self.morph = pymorphy2.MorphAnalyzer()
     self.review_docs = []
     self.aspects = set()
     self.id_to_opinions = {}
Пример #2
0
 def __init__(self, application_name, config_file_name):
     """Parse the config, open Mongo access, and init empty containers."""
     self.__parse_config(config_file_name, application_name)
     self.mongo = MongoDb(self.collection_name_prefix)
     self.aspects_statistics = AspectsStatistics()
     self.sentiment_dictionary = {}
     self.review_docs = []
     self.modified_review_docs = []
Пример #3
0
 def __init__(self, ru_wordnet_affect_positive_files, ru_wordnet_affect_negative_files, rusentilex_file):
     """Remember the lexicon file paths and set up helper objects."""
     self.ru_wordnet_affect_positive_files = ru_wordnet_affect_positive_files
     self.ru_wordnet_affect_negative_files = ru_wordnet_affect_negative_files
     self.rusentilex_file = rusentilex_file
     self.morph = MorphAnalyzer()
     self.mongo = MongoDb("")
     self.sentiment_docs = []
     self.already_in_docs = set()
Пример #4
0
class SentimentDictionaryCreator:
    """Builds a sentiment lexicon from RuWordNet-Affect files and RuSentiLex.

    Collects documents of the form ``{'word': ..., 'polarity': ...}`` in
    ``sentiment_docs`` and can persist them to MongoDB.
    """

    def __init__(self, ru_wordnet_affect_positive_files, ru_wordnet_affect_negative_files, rusentilex_file):
        self.ru_wordnet_affect_positive_files = ru_wordnet_affect_positive_files
        self.ru_wordnet_affect_negative_files = ru_wordnet_affect_negative_files
        self.rusentilex_file = rusentilex_file
        self.sentiment_docs = []       # [{'word': ..., 'polarity': ...}]
        self.mongo = MongoDb("")
        self.morph = MorphAnalyzer()
        self.already_in_docs = set()   # words already collected, avoids duplicates

    def create(self):
        """Populate sentiment_docs from both lexicon sources."""
        self.__process_ru_wordnet_affect()
        self.__process_rusentilex()

    def __process_ru_wordnet_affect(self):
        """Process positive files with mark 'positive', negative with 'negative'."""
        for file_name in self.ru_wordnet_affect_positive_files:
            self.__process_file_with_mark(file_name, 'positive')
        for file_name in self.ru_wordnet_affect_negative_files:
            self.__process_file_with_mark(file_name, 'negative')

    def __process_file_with_mark(self, file_name, mark):
        """Collect Russian words from a tab-separated RuWordNet-Affect file.

        The Russian word list lives in the 4th tab-separated field.
        Infinitives and words that are most likely nouns are skipped.
        """
        with open(file_name, 'r', encoding='utf-8-sig') as file:
            for line in file:
                fields = line.split('\t')
                if len(fields) <= 3:
                    continue  # header / malformed line: no Russian-words field
                for word in fields[3].split():
                    morphed = self.morph.parse(word)[0]
                    pos = morphed.tag.POS
                    # skip verbs (INFN) and confident nouns
                    if pos == 'INFN' or (pos == 'NOUN' and morphed.score >= 0.5):
                        continue
                    word = word.replace('_', ' ')  # multiword entries use '_'
                    if word not in self.already_in_docs:
                        self.sentiment_docs.append({'word': word, 'polarity': mark})
                        self.already_in_docs.add(word)

    def __process_rusentilex(self):
        """Collect opinion adjectives from RuSentiLex (CSV, 18-line header).

        Columns: term, pos, lemma, polarity, source, ...
        """
        with open(self.rusentilex_file, 'r', encoding='utf-8-sig') as file:
            for _ in range(18):
                file.readline()  # skip the fixed-size file header
            for line in file:
                words = [w.strip() for w in line.split(',')]
                if len(words) < 5:
                    continue  # blank or malformed line
                pos = words[1]
                lemma = words[2]
                polarity = words[3]
                source = words[4]
                if pos == 'Adj' and source == 'opinion' and lemma not in self.already_in_docs:
                    self.sentiment_docs.append({'word': lemma, 'polarity': polarity})
                    self.already_in_docs.add(lemma)

    def write_sentiment_dictionary(self):
        """Persist the collected dictionary to MongoDB."""
        self.mongo.write_sentiment_dictionary(self.sentiment_docs)
Пример #5
0
def connMongoDb():
    """Return a MongoDB connection, authenticated when credentials are set.

    Reads host, port and optional username/password from the global `cf`
    config object's [MONGO] section.
    """
    global cf
    ip = cf.get("MONGO", "ip")
    port = int(cf.get("MONGO", "port"))
    username = cf.get("MONGO", 'username')
    password = cf.get("MONGO", 'password')
    conn = mongo.connect(ip, port)
    if username != '' and password != '':
        # authenticate against the admin database
        conn.admin.authenticate(username, password)
    return conn
Пример #6
0
class AspectsExtractor:
    """Mines single-word aspect candidates from review transactions
    using frequent-itemset mining (Apriori)."""

    def __init__(self, application_name, config_file_name):
        """Read the app config and connect to MongoDB."""
        self.__parse_config(config_file_name, application_name)
        self.mongo = MongoDb(self.collection_name_prefix)
        self.transaction_docs = []
        self.aspect_docs = []

    def __parse_config(self, config_file_name, application_name):
        """Load collection prefix and application label from the config file."""
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        section = self.config[application_name]
        self.collection_name_prefix = section['collection_name_prefix']
        self.application_label = section['application_label']

    def load_transactions(self):
        """Fetch transaction documents from MongoDB."""
        self.transaction_docs = self.mongo.load_transactions()

    def extract(self):
        """Run Apriori over the transactions and collect 1-item aspects."""
        transactions_for_te = [doc["transaction"].split()
                               for doc in self.transaction_docs]

        te = TransactionEncoder()
        oht_ary = te.fit(transactions_for_te).transform(transactions_for_te,
                                                        sparse=True)
        sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary,
                                                      columns=te.columns_)

        if not self.__settings_for_app_exist():
            # first run for this application: store defaults (minsup 10%, window 2)
            self.mongo.save_new_settings(self.application_label,
                                         self.collection_name_prefix, 10, 2)
            minsup = 0.1
        else:
            minsup = self.__get_minsup_fraction()

        df_aspects = apriori(sparse_df,
                             min_support=minsup,
                             use_colnames=True,
                             max_len=1)

        self.aspect_docs = [
            {'aspect': ' '.join(list(df_aspects.loc[i, 'itemsets'])),
             'support': df_aspects.loc[i, 'support']}
            for i in df_aspects.index
        ]

    def __settings_for_app_exist(self):
        """True when settings for this application are already stored."""
        found = self.mongo.find_settings(self.application_label)
        return found != 0

    def __get_minsup_fraction(self):
        """Return the stored minimum support converted from percent to fraction."""
        doc = self.mongo.load_minimum_support(self.application_label)
        return float(doc['minsup']) / 100

    def write_aspects(self):
        """Persist the mined aspects to MongoDB."""
        self.mongo.write_aspects(self.aspect_docs)
def main():
    """Build an occupancy grid from a floorplan image and dump it as ASCII.

    argv: [1] image path, [2] floorplan id, [3] pixels per foot,
          [4] output file path. Stores grid dimensions in MongoDB.
    Returns 0 on success.
    """
    img_path = sys.argv[1]
    floorplan_id = sys.argv[2]
    pixels_per_foot = int(sys.argv[3])
    save_file_path = sys.argv[4]

    data = DataGridMaker.DataGrid(img_path, pixels_per_foot)
    MongoDb.insert_dimensions(floorplan_id, data.width, data.height)

    # 'with' guarantees the file is closed even if a write fails
    # (original used open/close without exception safety)
    with open(save_file_path, 'w') as f:
        for row in data.map:
            # truthy cell -> free space (' '), falsy -> obstacle ('x')
            f.write(''.join(' ' if cell else 'x' for cell in row))
            f.write('\n')

    return 0
Пример #8
0
class SentimentAnalyzer:
    """Assigns polarity to extracted opinions using a sentiment dictionary
    and aggregates per-aspect statistics."""

    def __init__(self, application_name, config_file_name):
        self.__parse_config(config_file_name, application_name)
        self.mongo = MongoDb(self.collection_name_prefix)
        self.review_docs = []
        self.sentiment_dictionary = dict()  # sentiment word -> polarity
        self.modified_review_docs = []
        self.aspects_statistics = AspectsStatistics()

    def __parse_config(self, config_file_name, application_name):
        """Read the collection name prefix for this application."""
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.collection_name_prefix = self.config[application_name]['collection_name_prefix']

    def load_reviews(self):
        """Load review documents from MongoDB."""
        self.review_docs = self.mongo.load_reviews()

    def load_sentiment_dictionary_docs(self):
        """Build the word -> polarity mapping from MongoDB documents."""
        for doc in self.mongo.load_sentiment_dictionary():
            self.sentiment_dictionary[doc['word']] = doc['polarity']

    def analyze(self):
        """Annotate each opinion with its polarity and count aspect statistics.

        Each (aspect, sentiment) pair is counted at most once per review.
        Opinions whose sentiment is not in the dictionary are counted as
        'unknown' and left without a 'polarity' field.

        Bug fix: modified_review_docs is now reset here, so calling
        analyze() repeatedly no longer appends duplicate documents.
        """
        self.aspects_statistics = AspectsStatistics()
        self.modified_review_docs = []  # was never reset -> duplicates on re-run

        for review_doc in self.review_docs:
            if 'opinions' not in review_doc:
                continue
            already_counted_opinion = set()  # aspect+sentiment pairs seen in this review
            for opinion_obj in review_doc['opinions']:
                if 'sentiment' not in opinion_obj:
                    continue
                aspect = opinion_obj['aspect']
                sentiment = opinion_obj['sentiment']
                polarity = self.sentiment_dictionary.get(sentiment)
                if polarity is not None:
                    opinion_obj['polarity'] = polarity
                else:
                    polarity = "unknown"
                if aspect + sentiment not in already_counted_opinion:
                    self.aspects_statistics.consider(aspect, polarity)
                    already_counted_opinion.add(aspect + sentiment)
            self.modified_review_docs.append(review_doc)

    def write(self):
        """Persist annotated reviews and the aggregated statistics."""
        self.mongo.write_polarities(self.modified_review_docs)
        self.mongo.write_aspects_statistics(self.aspects_statistics.get_documents())
Пример #9
0
 def __init__(self, application_name, config_file_name):
     """Read the config, start the browser driver, and connect Mongo."""
     self.__parse_config(config_file_name, application_name)
     self.__create_driver()
     self.mongo = MongoDb(self.collection_name_prefix)
     self.page_to_parse_index = 0
Пример #10
0
class GooglePlayScraper:
    """Scrapes user reviews from a Google Play app page with a headless
    Chrome WebDriver and writes them to MongoDB page by page."""

    def __init__(self, application_name, config_file_name):
        # index of the review page currently being parsed
        self.page_to_parse_index = 0
        self.__parse_config(config_file_name, application_name)
        self.__create_driver()
        self.mongo = MongoDb(self.collection_name_prefix)

    def __parse_config(self, config_file_name, application_name):
        """Read scraper settings: target URL, collection prefix, delay, page count."""
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.url_to_scrap = self.config[application_name]['url_to_scrap']
        self.collection_name_prefix = self.config[application_name]['collection_name_prefix']
        self.sleep_in_seconds = int(self.config['Scraper']['sleep_in_seconds'])
        self.page_to_scrap_count = int(self.config['Scraper']['page_to_scrap_count'])

    def __create_driver(self):
        """Start a headless Chrome WebDriver."""
        chrome_options = Options()
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=chrome_options)

    def skip_first_pages(self, pages_count):
        """Advance past `pages_count` review pages without saving them."""
        for i in range(pages_count):
            self.page_to_parse_index += 1
            self.__delete_reviews()
            self.__trigger_next_page()
            if self.__check_button_existence():
                self.__click_button()
            time.sleep(self.sleep_in_seconds)

    def __delete_reviews(self):
        # Remove already-processed review nodes from the DOM so the next
        # batch can be located without re-reading old ones.
        self.driver.execute_script(
            'var root = document.querySelector("[jsname=fk8dgd]"); \
            while (root.firstChild) { \
                root.removeChild(root.firstChild); \
            }'
        )

    def __trigger_next_page(self):
        # Scroll up slightly, then smooth-scroll to the bottom so the page
        # lazy-loads the next portion of reviews.
        self.driver.execute_script('window.scrollBy(0, -20)')
        self.driver.execute_script(
            'window.scrollTo({top: document.body.scrollHeight, behavior: "smooth"})'
        )

    def __check_button_existence(self):
        """Return True when the 'Показать ещё' ("Show more") button is present."""
        try:
            self.driver.find_element_by_xpath("//span[contains(./text(), 'Показать ещё')]/../..")
        except NoSuchElementException:
            return False
        return True

    def __click_button(self):
        # Click via JS to avoid element-not-clickable/overlay issues.
        btn = self.driver.find_element_by_xpath("//span[contains(./text(), 'Показать ещё')]/../..")
        self.driver.execute_script("arguments[0].click();", btn)

    def scrap(self):
        """Main loop: load the page, then repeatedly parse, store and advance."""
        self.driver.get(self.url_to_scrap)
        for i in range(self.page_to_scrap_count):
            self.page_to_parse_index += 1
            reviews = self.__get_reviews_containers()
            # a full batch from Google Play is 40 reviews
            if len(reviews) == 40:
                print("OK: {0} out of 40".format(len(reviews)))
            else:
                print("WARN: {0} out of 40".format(len(reviews)))
            reviews_data = self.__get_reviews_data(reviews)
            self.mongo.write_reviews(reviews_data)
            self.__delete_reviews()
            self.__trigger_next_page()
            if self.__check_button_existence():
                self.__click_button()
            time.sleep(self.sleep_in_seconds)
        self.driver.quit()

    def __get_reviews_containers(self):
        """Return the list of review container elements (empty list on error)."""
        try:
            reviews = self.driver.find_elements_by_xpath("//div[contains(@jsname, 'fk8dgd')]/div")
        except Exception as e:
            print("[ERROR] ошибка при получении отзывов со страницы: {0}\n".format(str(e)))
            reviews = []
        return reviews

    def __get_reviews_data(self, reviews):
        """Extract name, score, date, likes, avatar flag and text per review."""
        reviews_data = []
        for review in reviews:
            name = review.find_element_by_xpath(".//span[contains(@class, 'X43Kjb')]").text
            # star rating is the third whitespace-separated token of the aria-label
            score = int(review.find_element_by_xpath(
                ".//div[contains(@role, 'img')]"
            ).get_attribute("aria-label").split()[2])
            likes = self.__get_likes(review)
            date = self.__get_date(review)
            # the default (placeholder) avatar URL contains '/photo.jpg'
            has_photo = '/photo.jpg' not in review.find_element_by_xpath(
                ".//img[contains(@class, 'T75of ZqMJr')]"
            ).get_attribute("src")
            text = self.__get_text(review)
            reviews_data.append({
                'name': name, 'score': score, 'date': date, 'likes': likes, 'has_photo': has_photo, 'text': text
            })
        return reviews_data

    def __get_likes(self, review):
        """Return the like counter as int; empty text means zero."""
        likes = review.find_element_by_xpath(".//div[contains(@class, 'jUL89d y92BAb')]").text
        if likes == '':
            likes = 0
        else:
            likes = int(likes)
        return likes

    def __get_date(self, review):
        """Convert a Russian date like '5 марта 2020' to 'YYYY-MM-D'.

        NOTE(review): the day is not zero-padded, so the result is not
        strictly ISO 8601 — confirm whether consumers rely on this format.
        """
        date_splitted_str = review.find_element_by_xpath(".//span[contains(@class, 'p2TkOb')]").text.split()
        day = date_splitted_str[0]
        year = date_splitted_str[2]
        month_str = date_splitted_str[1]
        # genitive Russian month names as rendered on the Russian-locale page
        month_str_to_num = {'января': '01', 'февраля': '02', 'марта': '03', 'апреля': '04',
                            'мая': '05', 'июня': '06', 'июля': '07', 'августа': '08',
                            'сентября': '09', 'октября': '10', 'ноября': '11', 'декабря': '12'}
        month = month_str_to_num[month_str]
        return year + '-' + month + '-' + day

    def __get_text(self, review):
        """Return the full review text; fall back to the short span when empty."""
        text = review.find_element_by_xpath(".//span[contains(@jsname, 'fbQN7e')]").get_attribute('textContent')
        if text == '':
            text = review.find_element_by_xpath(".//span[contains(@jsname, 'bN97Pc')]").text
        return text
Пример #11
0
 def __init__(self, application_name, config_file_name):
     """Parse the config, connect Mongo, and start with empty doc lists."""
     self.__parse_config(config_file_name, application_name)
     self.mongo = MongoDb(self.collection_name_prefix)
     self.aspect_docs = []
     self.transaction_docs = []
Пример #12
0
class TransactionsManager:
    """Turns raw review texts into 'transactions' (sets of nouns and
    adjective+noun phrases) suitable for frequent-itemset mining."""

    def __init__(self, application_name, config_file_name):
        self.__parse_config(config_file_name, application_name)
        self.mongo = MongoDb(self.collection_name_prefix)
        self.transaction_docs = []
        self.review_docs = []

    def __parse_config(self, config_file_name, application_name):
        """Read the collection name prefix for this application."""
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.collection_name_prefix = self.config[application_name][
            'collection_name_prefix']

    def load_reviews(self):
        """Append all review documents from MongoDB to review_docs."""
        for e in self.mongo.load_reviews():
            self.review_docs.append(e)

    def create_transactions(self):
        """One transaction per review: space-joined normal forms of its nouns."""
        morph = pymorphy2.MorphAnalyzer()
        self.transaction_docs = []
        for doc in self.review_docs:
            # keep only Cyrillic letters and digits
            clear_text = re.sub('[^А-Яа-яё0-9]+', ' ', doc['text']).lower()
            transaction = []
            for word in clear_text.split():
                morphed = morph.parse(word)[0]
                if morphed.tag.POS == 'NOUN':
                    transaction.append(morphed.normal_form)
            self.transaction_docs.append({
                '_id': doc['_id'],
                'transaction': ' '.join(transaction)
            })

    def write_transactions(self):
        """Persist the built transactions to MongoDB."""
        self.mongo.write_transactions(self.transaction_docs)

    def create_transactions_advanced(self):
        """Like create_transactions, but also captures ADJF+NOUN noun phrases.

        Bug fixes:
        * the last word of a sentence was tested with
          ``normal_form == 'NOUN'`` (a lemma compared to a POS tag — always
          false) instead of ``tag.POS == 'NOUN'``;
        * ``inflect`` may return None; that crashed on subscripting — the
          uninflected adjective is now used as a fallback.
        """
        morph = pymorphy2.MorphAnalyzer()
        self.transaction_docs = []
        for doc in self.review_docs:
            transaction = []
            clear_text = re.sub('[^А-Яа-яё0-9]+', ' ', doc['text']).lower()
            for sentence in nltk.sent_tokenize(clear_text, language='russian'):
                bigrams = list(
                    nltk.bigrams(
                        nltk.word_tokenize(sentence.lower(),
                                           language='russian')))
                for first, second in bigrams:
                    first_morphed = morph.parse(first)[0]
                    second_morphed = morph.parse(second)[0]
                    morphed = [first_morphed, second_morphed]
                    if TransactionsManager.__is_noun_phrase(morphed):
                        # agree the adjective with the noun: singular, noun's gender
                        inflected = morphed[0].inflect({'sing', morphed[1].tag.gender})
                        adj = inflected[0] if inflected is not None else morphed[0][0]
                        transaction.append(' '.join([adj, morphed[1][0]]))
                    elif first_morphed.tag.POS == 'NOUN':
                        transaction.append(first_morphed.normal_form)
                if bigrams:
                    # the second word of the final bigram is never visited as 'first'
                    last_morphed = morph.parse(bigrams[-1][1])[0]
                    if last_morphed.tag.POS == 'NOUN':  # was: normal_form == 'NOUN'
                        transaction.append(last_morphed.normal_form)
            self.transaction_docs.append({
                "_id": doc['_id'],
                "transaction": ';'.join(set(transaction))
            })

    @staticmethod
    def __is_noun_phrase(words):
        """True when words form an ADJF+NOUN pair agreeing in case and gender."""
        return words[0].tag.POS == 'ADJF' and words[1].tag.POS == 'NOUN' \
               and words[0].tag.case == words[1].tag.case \
               and words[0].tag.gender == words[1].tag.gender
Пример #13
0
class OpinionsExtractor:
    """Extracts (aspect, sentiment) opinion pairs from reviews.

    For every known aspect noun found in a sentence, a window of words
    around it is scanned for an adjective/participle that grammatically
    agrees with the noun; the closest agreeing word becomes the aspect's
    sentiment.
    """

    def __init__(self, application_name, config_file_name):
        self.__parse_config(config_file_name, application_name)
        self.mongo = MongoDb(self.collection_name_prefix)
        self.morph = pymorphy2.MorphAnalyzer()
        self.aspects = set()          # normal forms of known aspect nouns
        self.id_to_opinions = dict()  # review _id -> list of opinion docs
        self.review_docs = []

    def __parse_config(self, config_file_name, application_name):
        """Read collection prefix and application label from the config file."""
        self.config = configparser.ConfigParser()
        self.config.read(config_file_name, encoding='utf-8')
        self.collection_name_prefix = self.config[application_name][
            'collection_name_prefix']
        self.application_label = self.config[application_name][
            'application_label']

    def load_aspects(self):
        """Load the aspect vocabulary from MongoDB."""
        for doc in self.mongo.load_aspects():
            self.aspects.add(doc["aspect"])

    def load_reviews(self):
        """Load review documents from MongoDB."""
        self.review_docs = self.mongo.load_reviews()

    def extract(self):
        """Scan every review sentence and collect opinions per review id."""
        window_radius = self.__get_window_radius()
        stopwords = set(nltk.corpus.stopwords.words('russian'))
        stopwords.remove('не')  # keep the negation particle in sentences
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        self.id_to_opinions = dict()

        for doc in self.review_docs:
            id = doc["_id"]
            text = doc["text"]
            for sentence in nltk.sent_tokenize(text.lower(),
                                               language='russian'):
                words = tokenizer.tokenize(sentence)
                filtered_words = [w for w in words if w not in stopwords]
                for j in range(len(filtered_words)):
                    morphed = self.morph.parse(filtered_words[j])[0]
                    if morphed.tag.POS == 'NOUN' and morphed.normal_form in self.aspects:
                        self.__scan_window_for_sentiment(
                            id, filtered_words, j, window_radius)

    def __get_window_radius(self):
        """Return the configured scan-window radius for this application."""
        doc = self.mongo.load_window_radius(self.application_label)
        return int(doc['window'])

    def __process_sentence(self, id, words):
        """Alternative whole-sentence matching strategy (currently unused).

        Records all adjectives of the sentence as 'sentiment_words' and the
        closest matching one as 'effective_sentiment'.
        """
        morphed_adjective_pos_inverse_flag_tuples = []
        morphed_aspect_pos_pairs = []

        for i in range(len(words)):
            word = words[i]
            # check the two most probable parses of each word
            for morphed in self.morph.parse(word)[:2]:
                if morphed.tag.POS == 'ADJF':
                    is_inversed = self.__is_inversed_polarity(words, i)
                    morphed_adjective_pos_inverse_flag_tuples.append(
                        (morphed, i, is_inversed))
                    break
                elif morphed.tag.POS == 'NOUN' and morphed.normal_form in self.aspects:
                    morphed_aspect_pos_pairs.append((morphed, i))
                    break

        sentiment_words_and_inverse_flag = []
        for (morphed, _,
             is_inversed) in morphed_adjective_pos_inverse_flag_tuples:
            if is_inversed:
                sentiment_words_and_inverse_flag.append({
                    'word': morphed.normal_form,
                    'is_inversed': is_inversed
                })
            else:
                sentiment_words_and_inverse_flag.append(
                    {'word': morphed.normal_form})

        adjectives_normal_forms_length = len(sentiment_words_and_inverse_flag)
        window_radius = self.__get_window_radius()

        for (morphed_aspect, aspect_pos) in morphed_aspect_pos_pairs:
            left_pos = max(0, aspect_pos - window_radius)
            right_pos = min(aspect_pos + window_radius, len(words) - 1)
            min_dist = 9999
            effective_sentiment = ''
            for (morphed_adjective, adjective_pos,
                 _) in morphed_adjective_pos_inverse_flag_tuples:
                if adjective_pos < left_pos:
                    continue
                elif adjective_pos > right_pos:
                    break
                else:
                    dist = abs(aspect_pos - adjective_pos)
                    if self.__are_adjective_and_aspect_match(
                            morphed_adjective, morphed_aspect, dist, min_dist):
                        min_dist = dist
                        effective_sentiment = morphed_adjective.normal_form
            doc = {'aspect': morphed_aspect.normal_form}
            if effective_sentiment != '':
                doc['effective_sentiment'] = effective_sentiment
            if adjectives_normal_forms_length != 0:
                doc['sentiment_words'] = sentiment_words_and_inverse_flag
            self.id_to_opinions.setdefault(id, []).append(doc)

    def __is_inversed_polarity(self, words, adj_pos):
        """True when the adjective is preceded by the negation particle 'не'."""
        if adj_pos == 0:
            return False
        return words[adj_pos - 1] == 'не'

    def __are_adjective_and_aspect_match(self, morphed_adjective,
                                         morphed_aspect, dist, min_dist):
        """Agreement test used by __process_sentence: full adjective must
        match the aspect's case; short adjectives always match."""
        return (morphed_adjective.tag.POS == 'ADJF'
                and morphed_adjective.tag.case == morphed_aspect.tag.case
                or morphed_adjective.tag.POS == 'ADJS') and dist < min_dist

    def __scan_window_for_sentiment(self, id, sentence_words, aspect_position,
                                    window_radius):
        """Find the closest agreeing word inside the window around the aspect
        and record an opinion document for review *id*."""
        left_i = max(0, aspect_position - window_radius)
        right_i = min(aspect_position + window_radius, len(sentence_words) - 1)
        min_distance = 1000  # effectively +inf for realistic sentences
        sentiment = ''
        aspect_morphed = self.morph.parse(sentence_words[aspect_position])[0]

        for i in range(left_i, right_i + 1):
            if i == aspect_position:
                continue
            # examine the two most probable parses of the candidate word
            for morphed in self.morph.parse(sentence_words[i])[:2]:
                if self.__are_words_consistent(morphed, aspect_morphed, i,
                                               aspect_position, min_distance):
                    min_distance = abs(i - aspect_position)
                    sentiment = morphed.normal_form
                    break
        doc = {'aspect': aspect_morphed.normal_form}
        if sentiment != '':
            doc['sentiment'] = sentiment
        self.id_to_opinions.setdefault(id, []).append(doc)

    def __are_words_consistent(self, morphed, aspect_morphed, word_position,
                               aspect_position, min_distance):
        """True when *morphed* is an adjective/participle agreeing with the
        aspect in at least two of {case, number, gender} and lies strictly
        closer than the best match found so far.

        Bug fix: the distance test now uses abs(); previously a word to the
        left of the aspect always passed the distance check because the raw
        (negative) difference was compared to min_distance.
        """
        if abs(word_position - aspect_position) >= min_distance:
            return False
        pos = morphed.tag.POS
        if pos == 'ADJF' or pos == 'ADJS' or pos == 'PRTF':
            score = 0
            if morphed.tag.case == aspect_morphed.tag.case:
                score += 1
            if morphed.tag.number == aspect_morphed.tag.number:
                score += 1
            if morphed.tag.gender == aspect_morphed.tag.gender:
                score += 1
            return score >= 2
        return False

    def write_opinions(self):
        """Persist the collected opinions to MongoDB."""
        self.mongo.write_opinions(self.id_to_opinions)