Example #1
class Program:
    def __init__(self):
        self.name = 'Поиск синонимов'  # "Synonym search"
        self.file_name = 'links_synonims'

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.list_value = 40
        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }
        self.finded_synonims = None

    def get_last_status(self):

        Base().connection()

        try:
            max_date = VocabularyStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = VocabularyStatus.objects.get(date=max_date)
        except Exception:  # typically DoesNotExist when the table is empty
            last_status = 'no status'
        return last_status

    def get_last_error(self):

        Base().connection()

        try:
            max_date = VocabularyError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = VocabularyError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):

        Base().connection()

        # format_exception returns a list of lines; join them into one string
        e_type, e_value, e_traceback = sys.exc_info()
        VocabularyError.objects.create(
            error=''.join(
                traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):

        Base().connection()

        if count is not None:
            status = 'Ok'
        else:
            status = 'Empty'
            count = 0

        VocabularyStatus.objects.create(status=status, count=count)

    def start(self):

        Base().connection()

        for key, table in self.voc_models.items():
            words = table.objects.filter(
                vikidict_scaned=False)[:self.list_value]

            if len(words) > 0:
                result = Vikidict().start(words)
                self.update_db(table, result)

        # save the number of processed words
        self.save_status(self.finded_synonims)

        self.finded_synonims = None

    def update_db(self, table, result):
        # record the synonyms
        words_ids = []
        for line in result:
            for synonim in line['synonims']:
                word = table.objects.filter(crc32=synonim['crc32'])
                if word.exists():
                    word.update(
                        level=1,
                        parent_id=line['id'],
                        vikidict_scaned=True,
                    )
                else:
                    table.objects.create(name=synonim['synonim'],
                                         crc32=synonim['crc32'],
                                         vikidict_scaned=True)
            words_ids.append(line['id'])
        # mark the source words as processed
        table.objects.filter(id__in=words_ids).update(vikidict_scaned=True)

        if self.finded_synonims is None:
            self.finded_synonims = len(words_ids)
        else:
            self.finded_synonims += len(words_ids)

    # clear every vocabulary table
    def clear_vocabulary(self):
        Base().connection()

        for key, value in self.voc_models.items():
            value.objects.all().delete()

    ########################################
    # run the program
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):

        processes = psutil.pids()

        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))

        # read the stored pid; if that process is gone, drop the stale file
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])

        if pid_value in processes:
            return pid_value

        os.remove(pid_path)
        return None
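
These snippets omit their imports. A minimal, hypothetical module header and launch call for a daemon like this one (the project-specific import paths are assumptions, not part of the original listing):

import os
import sys
import time
import traceback

import psutil
from django.db.models import Max

# Project-specific helpers; exact paths are assumed for illustration:
# from daemons.base import Base
# from vocabulary.models import (NOUN, ADJF, ADJS, COMP, VERB, INFN, PRTF,
#                                PRTS, GRND, NUMR, ADVB, LATN, NUMB, intg,
#                                real, VocabularyStatus, VocabularyError)
# from vikidict import Vikidict

if __name__ == '__main__':
    # run_daemon() opens the daemon context and polls every 300 seconds
    Program().run_daemon()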
Example #2
class Program:
    def __init__(self):
        self.publications_count = 400
        self.name = 'Копирование'  # "Copying"
        self.file_name = 'copy_publications'
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.publication_table_columns = [
            'crawler__id',
            'crawler__name',
            'crawler__name_cyrillic',
            'title',
            'text',
            'date',
            'author',
        ]

        self.copypublication_table_columns = [
            'crawler_id',
            'title',
            'text',
            'date',
        ]

    def get_last_status(self):

        Base().connection()

        try:
            max_date = CopyPublicationStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = CopyPublicationStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):

        Base().connection()

        try:
            max_date = CopyPublicationError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = CopyPublicationError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):

        Base().connection()

        # format_exception returns a list of lines; join them into one string
        e_type, e_value, e_traceback = sys.exc_info()
        CopyPublicationError.objects.create(
            error=''.join(
                traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):

        Base().connection()

        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'

        CopyPublicationStatus.objects.create(status=status, count=count)

    def get_date(self):

        Base().connection()

        try:
            date = CopyPublication.objects.all().aggregate(
                Max('date'))['date__max']
        except Exception:
            date = None
        return date

    def __remove_doubles(self, publications):
        # when a later item matches on every compared field, drop the earlier
        # one and rescan; the last occurrence of each duplicate survives
        for key, publication in enumerate(publications):
            if any(p['title'] == publication['title']
                   and p['text'] == publication['text']
                   and p['date'] == publication['date']
                   and p['crawler__id'] == publication['crawler__id']
                   for p in publications[key + 1:]):

                del publications[key]

                return self.__remove_doubles(publications)

        return publications

    def __remove_doubles_by_copypublication_table(self, publications,
                                                  copypublications):
        # drop publications that already exist in the CopyPublication table
        for key, publication in enumerate(publications):
            if any(p['crawler_id'] == publication['crawler__id']
                   and p['title'] == publication['title']
                   and p['text'] == publication['text']
                   and p['date'] == publication['date']
                   for p in copypublications):

                del publications[key]

                return self.__remove_doubles_by_copypublication_table(
                    publications, copypublications)
        return publications

    def push(self, date):

        Base().connection()

        if date is None:
            publications = list(
                Publication.objects.using('manager').all().values(
                    *self.publication_table_columns).order_by('date')
                [:self.publications_count])
        else:
            publications = list(
                Publication.objects.using('manager').filter(
                    date__gte=date).values(*self.publication_table_columns).
                order_by('date')[:self.publications_count])

        # remove duplicates that already exist in manager.Publication
        publications = self.__remove_doubles(publications)

        # remove duplicates that already exist in canonizator.PublicationCopy
        if date is not None:

            copypublications = CopyPublication.objects.filter(
                date__gte=date -
                timedelta(days=1)).values(*self.copypublication_table_columns)

            publications = self.__remove_doubles_by_copypublication_table(
                publications, copypublications)

        # write the filtered publications into CopyPublication
        copypublications = []

        for publication in publications:
            copypublications.append(
                CopyPublication(
                    crawler_id=publication['crawler__id'],
                    name=publication['crawler__name'],
                    name_cyrillic=publication['crawler__name_cyrillic'],
                    title=publication['title'],
                    text=publication['text'],
                    date=publication['date'],
                    author=publication['author'],
                ))

        count = len(copypublications)

        if count > 0:

            Base().connection()

            CopyPublication.objects.bulk_create(copypublications)

        self.save_status(count)

    ########################################
    # run the program
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        date = self.get_date()
                        self.push(date)
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):

        processes = psutil.pids()

        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))

        # read the stored pid; if that process is gone, drop the stale file
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])

        if pid_value in processes:
            return pid_value

        os.remove(pid_path)
        return None
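
__remove_doubles above restarts a recursive scan after every deletion, which is quadratic and can exhaust the recursion limit on large batches. A single-pass equivalent over plain dicts might look like this (a sketch; like the recursive version, it keeps the last occurrence of each duplicate):

def remove_doubles(publications):
    # keep the last occurrence of each (title, text, date, crawler__id)
    # tuple, preserving the surviving items' relative order
    seen = set()
    kept = []
    for p in reversed(publications):
        key = (p['title'], p['text'], p['date'], p['crawler__id'])
        if key not in seen:
            seen.add(key)
            kept.append(p)
    kept.reverse()
    return kept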
Example #3
class Program:
    def __init__(self):
        self.list_value = 400
        self.name = 'Канонизация'  # "Canonization"
        self.file_name = 'normalize_publications'
        self.morth = pymorphy2.MorphAnalyzer()

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.punctuations = re.compile(
            '([-_<>?/\\".„”“%,{}@#!&()=+:;«»—$&£*])')
        self.replace_with_spaces = {
            '\n',
            '\r',
            '\r\n',
            '\v',
            '\x0b',
            '\f',
            '\x0c',
            '\x1c',
            '\x1d',
            '\x1e',
            '\x85',
            '\u2028',
            '\u2029',
            '<br>',
            '<br />',
            '<p>',
            '</p>',
            '...',
            '\t',
            '\xa0',
            '&nbsp',
            ' ',
        }
        self.copypublication_fields = [
            'crawler_id',
            'name',
            'name_cyrillic',
            'title',
            'text',
            'author',
            'date',
            'id',
        ]

        self.grammems_to_remove = {
            'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'ROMN', 'UNKN'
        }

        self.grammems_to_remove_vocabulary = {
            'NPRO': [],
            'PRED': [],
            'PREP': [],
            'CONJ': [],
            'PRCL': [],
            'INTJ': [],
            'ROMN': [],
            'UNKN': [],
        }

        self.grammems_to_remove_models = {
            'NPRO': NPRO,
            'PRED': PRED,
            'PREP': PREP,
            'CONJ': CONJ,
            'PRCL': PRCL,
            'INTJ': INTJ,
            'ROMN': ROMN,
            'UNKN': UNKN,
        }

        self.vocabulary = {
            'NOUN': [],
            'ADJF': [],
            'ADJS': [],
            'COMP': [],
            'VERB': [],
            'INFN': [],
            'PRTF': [],
            'PRTS': [],
            'GRND': [],
            'NUMR': [],
            'ADVB': [],
            'LATN': [],
            'NUMB': [],
            'intg': [],
            'real': [],
        }

        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }

    def get_last_status(self):
        Base().connection()
        try:
            max_date = NormalizePublicationStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = NormalizePublicationStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = NormalizePublicationError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = NormalizePublicationError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def __clear_vocabulary(self, vocabulary):
        # empty every list in place, keeping the dict keys
        for value in vocabulary.values():
            value.clear()

    def start(self):
        last_pcopy = self.get_last_pcopy_id()
        pcopy_list = self.get_pcopy_list(last_pcopy)
        normalized_list = self.normalize(pcopy_list)
        self.save(normalized_list)

        ###############################
        # process the vocabulary
        self.__remove_doubles(self.vocabulary)
        self.__remove_already_have(self.vocabulary)
        self.__add_vocabulary_to_db(self.vocabulary)

        self.__clear_vocabulary(self.vocabulary)

        # store the grammemes marked for removal
        self.__remove_already_have_grammems_to_remove(
            self.grammems_to_remove_vocabulary)
        self.__add_vocabulary_grammems_to_remove_to_db(
            self.grammems_to_remove_vocabulary)

        self.__clear_vocabulary(self.grammems_to_remove_vocabulary)

    # remove duplicate values from the lists in the vocabulary
    def __remove_doubles(self, vocabulary):
        for key in vocabulary:
            vocabulary[key] = list(unique_everseen(vocabulary[key]))

    # remove grammemes-to-remove entries already present in the DB
    def __remove_already_have_grammems_to_remove(self, grammems_to_remove):

        Base().connection()

        for key, value in grammems_to_remove.items():
            doubles = self.grammems_to_remove_models[key].objects.filter(
                crc32__in=[word['crc32'] for word in grammems_to_remove[key]
                           ]).values('crc32')

            for double in doubles:
                for key2, word in enumerate(value):
                    if word['crc32'] == double['crc32']:
                        del value[key2]
                        break

    # remove vocabulary entries already present in the DB
    def __remove_already_have(self, vocabulary):

        Base().connection()

        for key, value in vocabulary.items():
            doubles = self.voc_models[key].objects.filter(
                name__in=vocabulary[key]).values('name')
            for double in doubles:
                self.__remove_from_array_by_value(vocabulary[key],
                                                  double['name'])

    # remove an item from a list by value
    def __remove_from_array_by_value(self, array, value):
        if value in array:
            array.remove(value)

    # bulk-insert the parts-of-speech lists marked for removal
    def __add_vocabulary_grammems_to_remove_to_db(self, grammems_to_remove):

        Base().connection()

        for key in grammems_to_remove:
            words = []
            for word in grammems_to_remove[key]:
                words.append(self.grammems_to_remove_models[key](
                    name=word['word'],
                    crc32=word['crc32'],
                ))
            if len(words) > 0:
                self.grammems_to_remove_models[key].objects.bulk_create(words)

    # bulk-insert the parts-of-speech lists
    def __add_vocabulary_to_db(self, vocabulary):

        Base().connection()

        for key in vocabulary:
            words = []
            for word in vocabulary[key]:
                words.append(self.voc_models[key](
                    name=word,
                    crc32=self.__convert_crc32(word),
                ))
            if len(words) > 0:
                self.voc_models[key].objects.bulk_create(words)

    def __convert_crc32(self, value):
        value_bytes = bytes(value, 'utf-8')
        return binascii.crc32(value_bytes)

    def save_error(self):

        Base().connection()

        # format_exception returns a list of lines; join them into one string
        e_type, e_value, e_traceback = sys.exc_info()
        NormalizePublicationError.objects.create(
            error=''.join(
                traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):

        Base().connection()

        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'

        NormalizePublicationStatus.objects.create(status=status, count=count)

    def remove_punctuation(self, string):
        for key in self.replace_with_spaces:
            string = string.replace(key, ' ')
        string = re.sub(self.punctuations, '', string)
        string = string.replace('ё', 'е')
        return string

    def split_line(self, line):
        words_list = line.split(' ')
        return words_list

    def parse_to_morph(self, word):
        return self.morth.parse(word)[0]

    def check_word(self, parsed_to_morph):
        if parsed_to_morph.tag.POS in self.grammems_to_remove:
            return False
        else:
            return True

    def normalize_word(self, parsed_to_morph):
        # the word should be lowercased and ё replaced with е
        normal_form = parsed_to_morph.normal_form
        # fill the vocabulary with every word we encounter
        self.fill_vocabulary(parsed_to_morph, normal_form)
        return normal_form

    # fill the vocabulary
    def fill_vocabulary(self, parsed_to_morph, normal_form):
        pos = parsed_to_morph.tag.POS
        if pos in self.vocabulary:
            self.vocabulary[pos].append(normal_form)

    def get_last_pcopy_id(self):

        Base().connection()

        try:
            last_pcopy = NormalizePublication.objects.all().aggregate(
                Max('CopyPublication_id'))['CopyPublication_id__max']
        except Exception:
            last_pcopy = None
        return last_pcopy

    def get_pcopy_list(self, last_pcopy):

        Base().connection()

        if last_pcopy is not None:
            pcopy_list = CopyPublication.objects.filter(
                id__gt=last_pcopy).values(
                    *self.copypublication_fields)[:self.list_value]
        else:
            pcopy_list = CopyPublication.objects.all().values(
                *self.copypublication_fields)[:self.list_value]
        return pcopy_list

    def normalize(self, pcopy_list):
        for pcopy in pcopy_list:

            pcopy['title'] = self.remove_punctuation(pcopy['title'])
            pcopy['text'] = self.remove_punctuation(pcopy['text'])

            title = []
            title_words = {}
            self.__check_n_normalize(title, title_words,
                                     self.split_line(pcopy['title']))
            pcopy['title'] = ' '.join(title)
            pcopy['title_words'] = title_words

            text = []
            text_words = {}
            self.__check_n_normalize(text, text_words,
                                     self.split_line(pcopy['text']))
            pcopy['text'] = ' '.join(text)
            pcopy['text_words'] = text_words

        return pcopy_list

    def __check_n_normalize(self, exp_list, exp_voc_list, words):
        for word in words:
            word_parsed_to_morph = self.parse_to_morph(word)
            if self.check_word(word_parsed_to_morph):

                normalized_word = self.normalize_word(word_parsed_to_morph)
                word_crc_32 = self.__convert_crc32(normalized_word)

                # plain list of normalized words
                exp_list.append(normalized_word)

                # parts-of-speech dict with the crc32 of each word
                pos = str(word_parsed_to_morph.tag.POS)

                if pos not in exp_voc_list:
                    exp_voc_list[pos] = [word_crc_32]
                else:
                    exp_voc_list[pos].append(word_crc_32)
            # fill the dict of parts of speech excluded from fuzzy-duplicate matching
            else:

                normalized_word = self.normalize_word(word_parsed_to_morph)
                word_crc_32 = self.__convert_crc32(normalized_word)

                pos = str(word_parsed_to_morph.tag.POS)

                if not any(voc_word['word'] == normalized_word
                           for voc_word in
                           self.grammems_to_remove_vocabulary[pos]):
                    self.grammems_to_remove_vocabulary[pos].append({
                        'word': normalized_word,
                        'crc32': word_crc_32,
                    })

    def save(self, normalized_list):

        Base().connection()

        normalized_publications = []
        for item in normalized_list:
            normalized_publications.append(
                NormalizePublication(
                    crawler_id=item['crawler_id'],
                    name=item['name'],
                    name_cyrillic=item['name_cyrillic'],
                    title=item['title'],
                    text=item['text'],
                    author=item['author'],
                    pubdate=item['date'],
                    CopyPublication_id=item['id'],
                    title_words=item['title_words'],
                    text_words=item['text_words'],
                ))
        count = len(normalized_publications)
        if count > 0:
            NormalizePublication.objects.bulk_create(normalized_publications)

        self.save_status(count)

    ########################################
    # run the program
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):

        processes = psutil.pids()

        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))

        # read the stored pid; if that process is gone, drop the stale file
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])

        if pid_value in processes:
            return pid_value

        os.remove(pid_path)
        return None
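
Stripped of the Django bookkeeping, the per-word normalization this class performs reduces to pymorphy2's normal form plus a CRC32 of the result. A minimal sketch, assuming pymorphy2 and its Russian dictionaries are installed:

import binascii

import pymorphy2

morph = pymorphy2.MorphAnalyzer()

def normalize(word):
    # most probable parse -> dictionary (normal) form -> CRC32 of its bytes
    parsed = morph.parse(word)[0]
    normal_form = parsed.normal_form
    return normal_form, str(parsed.tag.POS), binascii.crc32(
        bytes(normal_form, 'utf-8'))

# normalize('словами') would yield something like ('слово', 'NOUN', <int>)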
Example #4
class Program:
    def __init__(self):
        self.pub_without_status_length = 100
        self.retrospective_days_delta = 10
        self.name = 'Поиск нечетких дубликатов'  # "Fuzzy duplicate search"
        self.file_name = 'pubcompare'

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

    def get_last_status(self):

        Base().connection()

        try:
            max_date = PubCompareStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = PubCompareStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):

        Base().connection()

        try:
            max_date = PubCompareError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = PubCompareError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):

        Base().connection()

        # format_exception returns a list of lines; join them into one string
        e_type, e_value, e_traceback = sys.exc_info()
        PubCompareError.objects.create(
            error=''.join(
                traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):

        Base().connection()

        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'

        PubCompareStatus.objects.create(
            status=status,
            count=count,
        )

    def __get_pub_without_status_min_date(self, pub_list):
        # earliest pubdate in the batch (assumes the batch is non-empty)
        return pub_list[0].pubdate

    def get_pub_without_status(self):
        pub_list = NormalizePublication.objects.filter(
            status__isnull=True).exclude(title_hashes={}).order_by(
                'pubdate')[:self.pub_without_status_length]
        return pub_list

    def __get_unique_publications_min_date(self, pub_without_status_min_date):
        return pub_without_status_min_date - timezone.timedelta(
            days=self.retrospective_days_delta)

    def __get_unique_publications(self, pub_without_status_min_date):

        min_date_unique = self.__get_unique_publications_min_date(
            pub_without_status_min_date)

        pub_unique = NormalizePublication.objects.filter(
            pubdate__gt=min_date_unique,
            pubdate__lt=pub_without_status_min_date,
            status=PubCompare().status['unique']['db_value']).values(
                'id', 'title_hashes', 'text_hashes')

        result = []
        for pub in pub_unique:
            result.append({
                'id': pub['id'],
                'title_hashes': pub['title_hashes'],
                'text_hashes': pub['text_hashes']
            })

        return result

    def start(self):
        # publications that do not have a status yet
        publications = self.get_pub_without_status()

        pub_without_status_min_date = self.__get_pub_without_status_min_date(
            publications)

        # list of unique publications
        unique_publications = self.__get_unique_publications(
            pub_without_status_min_date)

        # determine each publication's status
        self.__search_status(publications, unique_publications)

        bulk_update(publications)

        self.save_status(len(publications))

    def __search_status(self, publications, unique_publications):
        for publication in publications:
            self.__compare_publication_with_unique_publications(
                publication, unique_publications)

    def __compare_publication_with_unique_publications(self, publication,
                                                       unique_publications):

        if len(unique_publications) > 0:
            for unique_publication in unique_publications:
                result = PubCompare().get_status(publication,
                                                 unique_publication)
                if result['status'] == 'reprint':
                    publication.status = PubCompare(
                    ).status['reprint']['db_value']
                    publication.parent_id = unique_publication['id']
                    break
                if result['status'] == 'copy':
                    publication.status = PubCompare(
                    ).status['copy']['db_value']
                    publication.parent_id = unique_publication['id']
                    break
            if publication.status is None:
                publication.status = PubCompare().status['unique']['db_value']
                self.__add_publication_in_unique_publications(
                    publication, unique_publications)
        else:
            publication.status = PubCompare().status['unique']['db_value']
            self.__add_publication_in_unique_publications(
                publication, unique_publications)

    def __add_publication_in_unique_publications(self, publication,
                                                 unique_publications):
        unique_publications.append({
            'id': publication.id,
            'title_hashes': publication.title_hashes,
            'text_hashes': publication.text_hashes,
        })

    def clear_statuses(self):
        NormalizePublication.objects.exclude(status__isnull=True).update(
            status=None)

    ########################################
    # run the program
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):

        processes = psutil.pids()

        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))

        # read the stored pid; if that process is gone, drop the stale file
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])

        if pid_value in processes:
            return pid_value

        os.remove(pid_path)
        return None
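
PubCompare().get_status is defined elsewhere; judging by the title/text hash lists these publications carry, it plausibly measures hash overlap against thresholds. A purely illustrative sketch (the thresholds and the exact decision rule are assumptions, not the project's code):

def overlap_ratio(hashes_a, hashes_b):
    # fraction of the hashes from a that also appear in b
    if not hashes_a:
        return 0.0
    b = set(hashes_b)
    return sum(1 for h in hashes_a if h in b) / len(hashes_a)

def get_status(publication, unique_publication,
               copy_threshold=0.9, reprint_threshold=0.6):
    # hypothetical rule: near-total overlap is a copy, substantial
    # overlap is a reprint, anything below that is unique
    ratio = overlap_ratio(publication.text_hashes,
                          unique_publication['text_hashes'])
    if ratio >= copy_threshold:
        return {'status': 'copy'}
    if ratio >= reprint_threshold:
        return {'status': 'reprint'}
    return {'status': 'unique'}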
Example #5
class Program:
    def __init__(self):
        self.publications_count = 400
        self.name = 'Создание хешей публикаций'  # "Build publication hashes"
        self.file_name = 'make_hashes'
        self.morth = pymorphy2.MorphAnalyzer()

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.vocabulary = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }

    def get_last_status(self):

        Base().connection()

        try:
            max_date = MakeHashesStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = MakeHashesStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):

        Base().connection()

        try:
            max_date = MakeHashesError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = MakeHashesError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):

        Base().connection()

        # format_exception returns a list of lines; join them into one string
        e_type, e_value, e_traceback = sys.exc_info()
        MakeHashesError.objects.create(
            error=''.join(
                traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):

        Base().connection()

        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'

        MakeHashesStatus.objects.create(status=status, count=count)

    def __get_all_words(self, packet):
        vocabulary = {}
        vocabulary['None'] = []

        for key, value in self.vocabulary.items():
            vocabulary[key] = []

        for line in packet:

            for key, words in line.title_words.items():
                self.__add_in_vocabulary_if_not_exists(key, words, vocabulary)

            for key, words in line.text_words.items():
                self.__add_in_vocabulary_if_not_exists(key, words, vocabulary)

        return vocabulary

    def __add_in_vocabulary_if_not_exists(self, key, words, vocabulary):
        for word in words:
            if word not in vocabulary[key]:
                vocabulary[key].append(word)

    def __replace_synonims(self, vocabulary):

        Base().connection()

        if 'None' in vocabulary:
            del vocabulary['None']

        for key, words in vocabulary.items():

            pos_words = self.vocabulary[key].objects.filter(
                crc32__in=words).values('id', 'parent_id', 'crc32')

            parent_ids = []

            for pos_word in pos_words:
                parent_ids.append(pos_word['parent_id'])

            pos_parents = self.vocabulary[key].objects.filter(
                id__in=parent_ids).values('id', 'crc32')

            result = []

            for pos_word in pos_words:
                result_line = {}
                for pos_parent in pos_parents:
                    if pos_word['parent_id'] == pos_parent['id']:
                        result_line['word_parent'] = pos_parent['crc32']
                result_line['word'] = pos_word['crc32']
                result.append(result_line)

            vocabulary[key] = result

    def __add_parent(self, result, vocabulary):
        # attach the parent's crc32 to the result line when the word has one
        for pos, words in vocabulary.items():
            doubled = 0
            for word in words:
                if word['word'] == result['word']:
                    if 'word_parent' in word:
                        result['word_parent'] = word['word_parent']
                        doubled = 1
                        break
            if doubled == 0:
                break

    def __link_numbers(self, poses_words, poses_hash, vocabulary):
        result = {}
        for pos, words in poses_words.items():
            result_list = []
            for word in words:
                result_line = {}
                no = self.__find_number(word, poses_hash)
                result_line['word'] = word
                result_line['no'] = no

                # attach the parent word
                self.__add_parent(result_line, vocabulary)

                result_list.append(result_line)
            result[pos] = result_list
        return result

    def __append_n_sort(self, line_words):
        result = []
        for pos, words in line_words.items():
            if pos != 'None':
                for word in words:
                    result.append(word)

        return sorted(result, key=lambda word: word['no'])

    def __make_list_with_parents(self, line_words):
        result = []
        for line_word in line_words:
            if 'word_parent' in line_word:
                result.append(line_word['word_parent'])
            else:
                result.append(line_word['word'])
        return result

    def start(self):

        Base().connection()

        packet = NormalizePublication.objects.filter(
            title_hashes={}).order_by('pubdate')[:self.publications_count]

        # collect all the words
        vocabulary = self.__get_all_words(packet)

        # pull in the synonyms
        self.__replace_synonims(vocabulary)

        result = []

        for line in packet:

            result_line = {}

            title = self.__hash_list(line.title.split(' '))
            text = self.__hash_list(line.text.split(' '))

            result_line['title_hash'] = title
            result_line['text_hash'] = text

            # attach position numbers to the title words
            result_line['title_words'] = self.__link_numbers(
                line.title_words, result_line['title_hash'], vocabulary)
            # merge all the words
            result_line['title_words'] = self.__append_n_sort(
                result_line['title_words'])
            # build the list of words
            result_line['title_words'] = self.__make_list_with_parents(
                result_line['title_words'])

            # attach position numbers to the text words
            result_line['text_words'] = self.__link_numbers(
                line.text_words, result_line['text_hash'], vocabulary)
            # merge all the words
            result_line['text_words'] = self.__append_n_sort(
                result_line['text_words'])
            # build the list of words
            result_line['text_words'] = self.__make_list_with_parents(
                result_line['text_words'])

            result.append({
                'id': line.id,
                'title_hashes': result_line['title_words'],
                'text_hashes': result_line['text_words'],
            })

        for line in packet:
            for result_line in result:
                if line.id == result_line['id']:
                    line.title_hashes = result_line['title_hashes']
                    line.text_hashes = result_line['text_hashes']

        bulk_update(packet)

        self.save_status(len(packet))

    def __append_numbers(self, list_words):
        # appears to be an unused, unfinished draft of __link_numbers;
        # 'title_has' in the original looked like a typo for 'title_hash'
        result_list = []
        for pos, words in list_words.items():
            for word in words:
                no = self.__find_number(word, list_words.get('title_hash', []))
                result_list.append({'word': word, 'no': no})
        return result_list

    def __find_number(self, word_to_find, words_list):
        for key, word in enumerate(words_list):
            if word_to_find == word:
                return key

    def __hash_list(self, words_list):
        crc32 = []
        for word in words_list:
            crc32.append(binascii.crc32(bytes(word, 'utf-8')))
        return crc32

    def delete_hashes(self):

        NormalizePublication.objects.exclude(title_hashes={}).update(
            title_hashes={},
            text_hashes={},
        )

    ########################################
    # run the program
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):

        processes = psutil.pids()

        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))

        # read the stored pid; if that process is gone, drop the stale file
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])

        if pid_value in processes:
            return pid_value

        os.remove(pid_path)
        return None
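
The hashing itself is a CRC32 per word; outside the class, __hash_list amounts to this one-liner (a sketch):

import binascii

def hash_list(words_list):
    # CRC32 of each word's UTF-8 bytes, as __hash_list does above
    return [binascii.crc32(bytes(word, 'utf-8')) for word in words_list]

# hash_list('пример текста'.split(' ')) -> [<int>, <int>]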