def push(self, date):
    Base().connection()
    if date is None:
        publications = list(
            Publication.objects.using('manager').all().values(
                *self.publication_table_columns).order_by('date')
            [:self.publications_count])
    else:
        publications = list(
            Publication.objects.using('manager').filter(
                date__gte=date).values(
                    *self.publication_table_columns).order_by('date')
            [:self.publications_count])
    # drop duplicates that already exist in manager.Publication
    publications = self.__remove_doubles(publications)
    # drop duplicates that already exist in canonizator.CopyPublication
    if date is not None:
        copypublications = CopyPublication.objects.filter(
            date__gte=date - timedelta(days=1)).values(
                *self.copypublication_table_columns)
        publications = self.__remove_doubles_by_copypublication_table(
            publications, copypublications)
    # write the filtered publications into CopyPublication
    copypublications = []
    for publication in publications:
        copypublications.append(
            CopyPublication(
                crawler_id=publication['crawler__id'],
                name=publication['crawler__name'],
                name_cyrillic=publication['crawler__name_cyrillic'],
                title=publication['title'],
                text=publication['text'],
                date=publication['date'],
                author=publication['author'],
            ))
    count = len(copypublications)
    if count > 0:
        Base().connection()
        CopyPublication.objects.bulk_create(copypublications)
    self.save_status(count)
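# A minimal sketch of the in-batch deduplication that self.__remove_doubles
# performs above; the real helper is not shown in the source, so the dedup
# key (crawler id + title + date) and the *_sketch name are assumptions.
def __remove_doubles_sketch(self, publications):
    seen = set()
    unique_publications = []
    for publication in publications:
        key = (publication['crawler__id'], publication['title'],
               publication['date'])
        if key not in seen:
            seen.add(key)
            unique_publications.append(publication)
    return unique_publications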
def __init__(self):
    self.publications_count = 400
    self.name = 'Создание хешей публикаций'  # "Publication hash generation"
    self.file_name = 'make_hashes'
    self.morth = pymorphy2.MorphAnalyzer()
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.vocabulary = {
        'NOUN': NOUN,
        'ADJF': ADJF,
        'ADJS': ADJS,
        'COMP': COMP,
        'VERB': VERB,
        'INFN': INFN,
        'PRTF': PRTF,
        'PRTS': PRTS,
        'GRND': GRND,
        'NUMR': NUMR,
        'ADVB': ADVB,
        'LATN': LATN,
        'NUMB': NUMB,
        'intg': intg,
        'real': real,
    }
def __init__(self):
    self.name = 'Поиск синонимов'  # "Synonym lookup"
    self.file_name = 'links_synonims'
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.list_value = 40
    self.voc_models = {
        'NOUN': NOUN,
        'ADJF': ADJF,
        'ADJS': ADJS,
        'COMP': COMP,
        'VERB': VERB,
        'INFN': INFN,
        'PRTF': PRTF,
        'PRTS': PRTS,
        'GRND': GRND,
        'NUMR': NUMR,
        'ADVB': ADVB,
        'LATN': LATN,
        'NUMB': NUMB,
        'intg': intg,
        'real': real,
    }
    self.finded_synonims = None
def __replace_synonims(self, vocabulary):
    Base().connection()
    if 'None' in vocabulary:
        del vocabulary['None']
    for key, words in vocabulary.items():
        pos_words = self.vocabulary[key].objects.filter(
            crc32__in=words).values('id', 'parent_id', 'crc32')
        parent_ids = []
        for pos_word in pos_words:
            parent_ids.append(pos_word['parent_id'])
        pos_parents = self.vocabulary[key].objects.filter(
            id__in=parent_ids).values('id', 'crc32')
        result = []
        for pos_word in pos_words:
            result_line = {}
            for pos_parent in pos_parents:
                if pos_word['parent_id'] == pos_parent['id']:
                    result_line['word_parent'] = pos_parent['crc32']
            result_line['word'] = pos_word['crc32']
            result.append(result_line)
        vocabulary[key] = result
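# Illustrative example (values invented, not from the source) of how
# __replace_synonims rewrites one vocabulary entry: a list of crc32 hashes
# becomes a list of dicts linking each word to its parent record, with
# 'word_parent' present only when a parent was found:
#
#   before: {'NOUN': [123456, 789012]}
#   after:  {'NOUN': [{'word': 123456, 'word_parent': 654321},
#                     {'word': 789012}]}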
def start(self):
    Base().connection()
    packet = NormalizePublication.objects.filter(
        title_hashes={}).order_by('pubdate')[:self.publications_count]
    # collect all words used in the packet
    vocabulary = self.__get_all_words(packet)
    # pull in synonyms
    self.__replace_synonims(vocabulary)
    result = []
    for line in packet:
        result_line = {}
        title = self.__hash_list(line.title.split(' '))
        text = self.__hash_list(line.text.split(' '))
        result_line['title_hash'] = title
        result_line['text_hash'] = text
        # attach the hashes to the title words
        result_line['title_words'] = self.__link_numbers(
            line.title_words, result_line['title_hash'], vocabulary)
        # merge and sort all words
        result_line['title_words'] = self.__append_n_sort(
            result_line['title_words'])
        # build the list with parent words
        result_line['title_words'] = self.__make_list_with_parents(
            result_line['title_words'])
        # attach the hashes to the text words
        result_line['text_words'] = self.__link_numbers(
            line.text_words, result_line['text_hash'], vocabulary)
        # merge and sort all words
        result_line['text_words'] = self.__append_n_sort(
            result_line['text_words'])
        # build the list with parent words
        result_line['text_words'] = self.__make_list_with_parents(
            result_line['text_words'])
        result.append({
            'id': line.id,
            'title_hashes': result_line['title_words'],
            'text_hashes': result_line['text_words'],
        })
    for line in packet:
        for result_line in result:
            if line.id == result_line['id']:
                line.title_hashes = result_line['title_hashes']
                line.text_hashes = result_line['text_hashes']
    bulk_update(packet)
    self.save_status(len(packet))
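# A minimal sketch of the __hash_list helper used above, assuming the
# per-word hashes are crc32 values (the vocabulary models carry a crc32
# column); the helper body is not shown in the source, so this is only an
# illustration. bulk_update() is presumably the helper from the
# django-bulk-update package.
import zlib

def hash_list_sketch(words):
    # hash every whitespace-separated token of a title or text
    return [zlib.crc32(word.encode('utf-8')) for word in words]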
def __init__(self):
    self.name = 'Поиск некорректных слов'  # "Incorrect word detection"
    self.file_name = 'incorrect_word_selection'
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.list_value = 10
    self.voc_models = {
        'NOUN': NOUN,
        'ADJF': ADJF,
        'ADJS': ADJS,
        'COMP': COMP,
        'VERB': VERB,
        'INFN': INFN,
        'PRTF': PRTF,
        'PRTS': PRTS,
        'GRND': GRND,
        'NUMR': NUMR,
        'ADVB': ADVB,
        'LATN': LATN,
        'NUMB': NUMB,
        'intg': intg,
        'real': real,
    }
    self.words_checked_count = None
def save_error(self):
    Base().connection()
    e_type, e_value, e_traceback = sys.exc_info()
    NormalizePublicationError.objects.create(
        error=traceback.format_exception(e_type, e_value, e_traceback))
def save_error(self):
    Base().connection()
    e_type, e_value, e_traceback = sys.exc_info()
    VocabularyError.objects.create(
        error=traceback.format_exception(e_type, e_value, e_traceback))
def __init__(self):
    self.publications_count = 400
    self.name = 'Копирование'  # "Copying"
    self.file_name = 'copy_publications'
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.publication_table_columns = [
        'crawler__id',
        'crawler__name',
        'crawler__name_cyrillic',
        'title',
        'text',
        'date',
        'author',
    ]
    self.copypublication_table_columns = [
        'crawler_id',
        'title',
        'text',
        'date',
    ]
def run_daemon(self):
    try:
        self.context.open()
        with self.context:
            while True:
                Base().update_working_status(self, 'waiting')
                can_program = Base().can_program(self)
                if can_program:
                    Base().update_working_status(self, 'working')
                    self.start()
                    Base().update_working_status(self, 'waiting')
                    Base().update_pidfile(self)
                    # wait five minutes before the next run
                    time.sleep(300)
                else:
                    time.sleep(300)
    except Exception:
        self.save_error()
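# A minimal sketch of what Base().create_daemon_context(file_name) might
# return, assuming it wraps the python-daemon package and keeps the pid file
# under the pids_dir set in the constructors above; the actual Base
# implementation is not shown in the source, so treat this as illustrative.
import os
import daemon
from daemon.pidfile import TimeoutPIDLockFile

def create_daemon_context_sketch(base_dir, file_name):
    pid_path = os.path.join(base_dir, 'daemons/pids', file_name + '.pid')
    return daemon.DaemonContext(pidfile=TimeoutPIDLockFile(pid_path))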
def get_last_error(self):
    Base().connection()
    try:
        max_date = NormalizePublicationError.objects.all().aggregate(
            Max('date'))['date__max']
        last_error = NormalizePublicationError.objects.get(date=max_date)
    except Exception:
        last_error = 'no status'
    return last_error
def save_status(self, count):
    Base().connection()
    if count > 0:
        status = 'Ok'
    else:
        status = 'Empty'
    MakeHashesStatus.objects.create(status=status, count=count)
def get_date(self):
    Base().connection()
    try:
        date = CopyPublication.objects.all().aggregate(
            Max('date'))['date__max']
    except Exception:
        date = None
    return date
def get_last_pcopy_id(self):
    Base().connection()
    try:
        last_pcopy = NormalizePublication.objects.all().aggregate(
            Max('CopyPublication_id'))['CopyPublication_id__max']
    except Exception:
        last_pcopy = None
    return last_pcopy
def save_status(self, count):
    Base().connection()
    if count > 0:
        status = 'Ok'
    else:
        status = 'Empty'
    CopyPublicationStatus.objects.create(status=status, count=count)
def __remove_already_have(self, vocabulary):
    Base().connection()
    for key, value in vocabulary.items():
        doubles = self.voc_models[key].objects.filter(
            name__in=vocabulary[key]).values('name')
        for double in doubles:
            self.__remove_from_array_by_value(vocabulary[key],
                                              double['name'])
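# A minimal sketch of the __remove_from_array_by_value helper called above;
# its body is not shown in the source, so the in-place removal below is an
# assumption about its behaviour.
def remove_from_array_by_value_sketch(array, value):
    # drop every occurrence of value from the list in place
    while value in array:
        array.remove(value)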
def __init__(self):
    self.pub_without_status_length = 100
    self.retrospective_days_delta = 10
    self.name = 'Поиск нечетких дубликатов'  # "Fuzzy duplicate search"
    self.file_name = 'pubcompare'
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
def save_status(self, count):
    Base().connection()
    if count is not None:
        status = 'Ok'
    else:
        status = 'Empty'
        count = 0
    VocabularyStatus.objects.create(status=status, count=count)
def get_last_error(self):
    Base().connection()
    try:
        max_date = VocabularyError.objects.all().aggregate(
            Max('date'))['date__max']
        last_error = VocabularyError.objects.get(date=max_date)
    except Exception:
        last_error = 'no status'
    return last_error
def get_last_status(self):
    Base().connection()
    try:
        max_date = CopyPublicationStatus.objects.all().aggregate(
            Max('date'))['date__max']
        last_status = CopyPublicationStatus.objects.get(date=max_date)
    except Exception:
        last_status = 'no status'
    return last_status
def get_pcopy_list(self, last_pcopy):
    Base().connection()
    # note: the passed-in value is overridden by the latest id from the db
    last_pcopy = self.get_last_pcopy_id()
    if last_pcopy is not None:
        pcopy_list = CopyPublication.objects.filter(
            id__gt=last_pcopy).values(
                *self.copypublication_fields)[:self.list_value]
    else:
        pcopy_list = CopyPublication.objects.all().values(
            *self.copypublication_fields)[:self.list_value]
    return pcopy_list
def save_status(self, count):
    Base().connection()
    if count > 0:
        status = 'Ok'
    else:
        status = 'Empty'
    PubCompareStatus.objects.create(
        status=status,
        count=count,
    )
def __add_vocabulary_grammems_to_remove_to_db(self, grammems_to_remove):
    Base().connection()
    for key in grammems_to_remove:
        words = []
        for word in grammems_to_remove[key]:
            words.append(self.grammems_to_remove_models[key](
                name=word['word'],
                crc32=word['crc32'],
            ))
        if len(words) > 0:
            self.grammems_to_remove_models[key].objects.bulk_create(words)
def __remove_already_have_grammems_to_remove(self, grammems_to_remove):
    Base().connection()
    for key, value in grammems_to_remove.items():
        doubles = self.grammems_to_remove_models[key].objects.filter(
            crc32__in=[word['crc32']
                       for word in grammems_to_remove[key]]).values('crc32')
        for double in doubles:
            for key2, word in enumerate(value):
                if word['crc32'] == double['crc32']:
                    del value[key2]
                    break
def __add_vocabulary_to_db(self, vocabulary):
    Base().connection()
    for key in vocabulary:
        words = []
        for word in vocabulary[key]:
            words.append(self.voc_models[key](
                name=word,
                crc32=self.__convert_crc32(word),
            ))
        if len(words) > 0:
            self.voc_models[key].objects.bulk_create(words)
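# A minimal sketch of the __convert_crc32 helper used above, assuming it is
# a plain zlib.crc32 over the UTF-8 encoded word (the vocabulary models
# store a crc32 column); the real helper is not shown in the source.
import zlib

def convert_crc32_sketch(word):
    return zlib.crc32(word.encode('utf-8'))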
def start(self):
    Base().connection()
    for key, table in self.voc_models.items():
        words = table.objects.filter(
            vikidict_scaned=False)[:self.list_value]
        if len(words) > 0:
            result = Vikidict().start(words)
            self.update_db(table, result)
    # save the number of processed words
    self.save_status(self.finded_synonims)
    self.finded_synonims = None
def start(self):
    Base().connection()
    for key, table in self.voc_models.items():
        words = table.objects.filter(
            vikidict_correction_tested=False,
            Tone__isnull=True,
        )[:self.list_value]
        if len(words) > 0:
            result = VikidictCorr().start(words)
            self.update_db(table, result)
    # save the number of words checked
    self.save_status(self.words_checked_count)
    self.words_checked_count = None
def save(self, normalized_list):
    Base().connection()
    normalized_publications = []
    for item in normalized_list:
        normalized_publications.append(
            NormalizePublication(
                crawler_id=item['crawler_id'],
                name=item['name'],
                name_cyrillic=item['name_cyrillic'],
                title=item['title'],
                text=item['text'],
                author=item['author'],
                pubdate=item['date'],
                CopyPublication_id=item['id'],
                title_words=item['title_words'],
                text_words=item['text_words'],
            ))
    count = len(normalized_publications)
    if count > 0:
        NormalizePublication.objects.bulk_create(normalized_publications)
    self.save_status(count)
def __init__(self):
    self.list_value = 400
    self.name = 'Канонизация'  # "Canonicalization"
    self.file_name = 'normalize_publications'
    self.morth = pymorphy2.MorphAnalyzer()
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.punctuations = re.compile(
        '([-_<>?/\\".„”“%,{}@#!&()=+:;«»—$&£*])')
    self.replace_with_spaces = {
        '\n', '\r', '\r\n', '\v', '\x0b', '\f', '\x0c', '\x1c', '\x1d',
        '\x1e', '\x85', '\u2028', '\u2029', '<br>', '<br />', '<p>',
        '</p>', '...', '\t', '\xa0', ' ', ' ',
    }
    self.copypublication_fields = [
        'crawler_id',
        'name',
        'name_cyrillic',
        'title',
        'text',
        'author',
        'date',
        'id',
    ]
    self.grammems_to_remove = {
        'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'ROMN', 'UNKN'
    }
    self.grammems_to_remove_vocabulary = {
        'NPRO': [],
        'PRED': [],
        'PREP': [],
        'CONJ': [],
        'PRCL': [],
        'INTJ': [],
        'ROMN': [],
        'UNKN': [],
    }
    self.grammems_to_remove_models = {
        'NPRO': NPRO,
        'PRED': PRED,
        'PREP': PREP,
        'CONJ': CONJ,
        'PRCL': PRCL,
        'INTJ': INTJ,
        'ROMN': ROMN,
        'UNKN': UNKN,
    }
    self.vocabulary = {
        'NOUN': [],
        'ADJF': [],
        'ADJS': [],
        'COMP': [],
        'VERB': [],
        'INFN': [],
        'PRTF': [],
        'PRTS': [],
        'GRND': [],
        'NUMR': [],
        'ADVB': [],
        'LATN': [],
        'NUMB': [],
        'intg': [],
        'real': [],
    }
    self.voc_models = {
        'NOUN': NOUN,
        'ADJF': ADJF,
        'ADJS': ADJS,
        'COMP': COMP,
        'VERB': VERB,
        'INFN': INFN,
        'PRTF': PRTF,
        'PRTS': PRTS,
        'GRND': GRND,
        'NUMR': NUMR,
        'ADVB': ADVB,
        'LATN': LATN,
        'NUMB': NUMB,
        'intg': intg,
        'real': real,
    }
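# A minimal sketch (not the actual normalization method, which is not shown
# in the source) of how self.replace_with_spaces and self.punctuations could
# be applied to clean a publication before lemmatization with pymorphy2.
def clean_text_sketch(self, text):
    # replace line breaks, HTML remnants and exotic whitespace with spaces
    for token in self.replace_with_spaces:
        text = text.replace(token, ' ')
    # strip punctuation characters matched by the compiled pattern
    text = self.punctuations.sub('', text)
    # collapse repeated spaces left behind by the replacements
    return ' '.join(text.split())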
def clear_vocabulary(self):
    Base().connection()
    for key, value in self.voc_models.items():
        value.objects.all().delete()