def push(self, date):
    Base().connection()
    if date is None:
        publications = list(
            Publication.objects.using('manager').all().values(
                *self.publication_table_columns).order_by('date')
            [:self.publications_count])
    else:
        publications = list(
            Publication.objects.using('manager').filter(
                date__gte=date).values(
                    *self.publication_table_columns).order_by('date')
            [:self.publications_count])
    # drop duplicates that already exist in manager.Publication
    publications = self.__remove_doubles(publications)
    # drop duplicates that already exist in canonizator.CopyPublication
    if date is not None:
        copypublications = CopyPublication.objects.filter(
            date__gte=date - timedelta(days=1)).values(
                *self.copypublication_table_columns)
        publications = self.__remove_doubles_by_copypublication_table(
            publications, copypublications)
    # write the filtered publications into CopyPublication
    copypublications = []
    for publication in publications:
        copypublications.append(
            CopyPublication(
                crawler_id=publication['crawler__id'],
                name=publication['crawler__name'],
                name_cyrillic=publication['crawler__name_cyrillic'],
                title=publication['title'],
                text=publication['text'],
                date=publication['date'],
                author=publication['author'],
            ))
    count = len(copypublications)
    if count > 0:
        Base().connection()
        CopyPublication.objects.bulk_create(copypublications)
    self.save_status(count)
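# A minimal sketch of the in-batch deduplication that self.__remove_doubles
# performs above; the real helper is not shown in the source, so the dedup
# key (crawler id + title + date) and the *_sketch name are assumptions.
def __remove_doubles_sketch(self, publications):
    seen = set()
    unique_publications = []
    for publication in publications:
        key = (publication['crawler__id'], publication['title'],
               publication['date'])
        if key not in seen:
            seen.add(key)
            unique_publications.append(publication)
    return unique_publications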
def __init__(self):
    self.publications_count = 400
    self.name = 'Создание хешей публикаций'  # "Publication hash generation"
    self.file_name = 'make_hashes'
    self.morth = pymorphy2.MorphAnalyzer()
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.vocabulary = {
        'NOUN': NOUN,
        'ADJF': ADJF,
        'ADJS': ADJS,
        'COMP': COMP,
        'VERB': VERB,
        'INFN': INFN,
        'PRTF': PRTF,
        'PRTS': PRTS,
        'GRND': GRND,
        'NUMR': NUMR,
        'ADVB': ADVB,
        'LATN': LATN,
        'NUMB': NUMB,
        'intg': intg,
        'real': real,
    }
def __init__(self):
    self.name = 'Поиск синонимов'  # "Synonym lookup"
    self.file_name = 'links_synonims'
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.list_value = 40
    self.voc_models = {
        'NOUN': NOUN,
        'ADJF': ADJF,
        'ADJS': ADJS,
        'COMP': COMP,
        'VERB': VERB,
        'INFN': INFN,
        'PRTF': PRTF,
        'PRTS': PRTS,
        'GRND': GRND,
        'NUMR': NUMR,
        'ADVB': ADVB,
        'LATN': LATN,
        'NUMB': NUMB,
        'intg': intg,
        'real': real,
    }
    self.finded_synonims = None
def __replace_synonims(self, vocabulary):
    Base().connection()
    if 'None' in vocabulary:
        del vocabulary['None']
    for key, words in vocabulary.items():
        pos_words = self.vocabulary[key].objects.filter(
            crc32__in=words).values('id', 'parent_id', 'crc32')
        parent_ids = []
        for pos_word in pos_words:
            parent_ids.append(pos_word['parent_id'])
        pos_parents = self.vocabulary[key].objects.filter(
            id__in=parent_ids).values('id', 'crc32')
        result = []
        for pos_word in pos_words:
            result_line = {}
            for pos_parent in pos_parents:
                if pos_word['parent_id'] == pos_parent['id']:
                    result_line['word_parent'] = pos_parent['crc32']
            result_line['word'] = pos_word['crc32']
            result.append(result_line)
        vocabulary[key] = result
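# Illustrative example (values invented, not from the source) of how
# __replace_synonims rewrites one vocabulary entry: a list of crc32 hashes
# becomes a list of dicts linking each word to its parent record, with
# 'word_parent' present only when a parent was found:
#
#   before: {'NOUN': [123456, 789012]}
#   after:  {'NOUN': [{'word': 123456, 'word_parent': 654321},
#                     {'word': 789012}]}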
def start(self):
    Base().connection()
    packet = NormalizePublication.objects.filter(
        title_hashes={}).order_by('pubdate')[:self.publications_count]
    # collect all words used in the packet
    vocabulary = self.__get_all_words(packet)
    # pull in synonyms
    self.__replace_synonims(vocabulary)
    result = []
    for line in packet:
        result_line = {}
        title = self.__hash_list(line.title.split(' '))
        text = self.__hash_list(line.text.split(' '))
        result_line['title_hash'] = title
        result_line['text_hash'] = text
        # attach the hashes to the title words
        result_line['title_words'] = self.__link_numbers(
            line.title_words, result_line['title_hash'], vocabulary)
        # merge and sort all words
        result_line['title_words'] = self.__append_n_sort(
            result_line['title_words'])
        # build the list with parent words
        result_line['title_words'] = self.__make_list_with_parents(
            result_line['title_words'])
        # attach the hashes to the text words
        result_line['text_words'] = self.__link_numbers(
            line.text_words, result_line['text_hash'], vocabulary)
        # merge and sort all words
        result_line['text_words'] = self.__append_n_sort(
            result_line['text_words'])
        # build the list with parent words
        result_line['text_words'] = self.__make_list_with_parents(
            result_line['text_words'])
        result.append({
            'id': line.id,
            'title_hashes': result_line['title_words'],
            'text_hashes': result_line['text_words'],
        })
    for line in packet:
        for result_line in result:
            if line.id == result_line['id']:
                line.title_hashes = result_line['title_hashes']
                line.text_hashes = result_line['text_hashes']
    bulk_update(packet)
    self.save_status(len(packet))
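# A minimal sketch of the __hash_list helper used above, assuming the
# per-word hashes are crc32 values (the vocabulary models carry a crc32
# column); the helper body is not shown in the source, so this is only an
# illustration. bulk_update() is presumably the helper from the
# django-bulk-update package.
import zlib

def hash_list_sketch(words):
    # hash every whitespace-separated token of a title or text
    return [zlib.crc32(word.encode('utf-8')) for word in words]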
def __init__(self):
    self.name = 'Поиск некорректных слов'  # "Incorrect word detection"
    self.file_name = 'incorrect_word_selection'
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.list_value = 10
    self.voc_models = {
        'NOUN': NOUN,
        'ADJF': ADJF,
        'ADJS': ADJS,
        'COMP': COMP,
        'VERB': VERB,
        'INFN': INFN,
        'PRTF': PRTF,
        'PRTS': PRTS,
        'GRND': GRND,
        'NUMR': NUMR,
        'ADVB': ADVB,
        'LATN': LATN,
        'NUMB': NUMB,
        'intg': intg,
        'real': real,
    }
    self.words_checked_count = None
def save_error(self):
    Base().connection()
    e_type, e_value, e_traceback = sys.exc_info()
    NormalizePublicationError.objects.create(
        error=traceback.format_exception(e_type, e_value, e_traceback))
def save_error(self):
    Base().connection()
    e_type, e_value, e_traceback = sys.exc_info()
    VocabularyError.objects.create(
        error=traceback.format_exception(e_type, e_value, e_traceback))
def __init__(self):
    self.publications_count = 400
    self.name = 'Копирование'  # "Copying"
    self.file_name = 'copy_publications'
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.publication_table_columns = [
        'crawler__id',
        'crawler__name',
        'crawler__name_cyrillic',
        'title',
        'text',
        'date',
        'author',
    ]
    self.copypublication_table_columns = [
        'crawler_id',
        'title',
        'text',
        'date',
    ]
def run_daemon(self):
    try:
        self.context.open()
        with self.context:
            while True:
                Base().update_working_status(self, 'waiting')
                can_program = Base().can_program(self)
                if can_program:
                    Base().update_working_status(self, 'working')
                    self.start()
                    Base().update_working_status(self, 'waiting')
                    Base().update_pidfile(self)
                    # wait five minutes before the next run
                    time.sleep(300)
                else:
                    time.sleep(300)
    except Exception:
        self.save_error()
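# A minimal sketch of what Base().create_daemon_context(file_name) might
# return, assuming it wraps the python-daemon package and keeps the pid file
# under the pids_dir set in the constructors above; the actual Base
# implementation is not shown in the source, so treat this as illustrative.
import os
import daemon
from daemon.pidfile import TimeoutPIDLockFile

def create_daemon_context_sketch(base_dir, file_name):
    pid_path = os.path.join(base_dir, 'daemons/pids', file_name + '.pid')
    return daemon.DaemonContext(pidfile=TimeoutPIDLockFile(pid_path))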
def get_last_error(self):
    Base().connection()
    try:
        max_date = NormalizePublicationError.objects.all().aggregate(
            Max('date'))['date__max']
        last_error = NormalizePublicationError.objects.get(date=max_date)
    except Exception:
        last_error = 'no status'
    return last_error
def save_status(self, count):
    Base().connection()
    if count > 0:
        status = 'Ok'
    else:
        status = 'Empty'
    MakeHashesStatus.objects.create(status=status, count=count)
def get_date(self):
    Base().connection()
    try:
        date = CopyPublication.objects.all().aggregate(
            Max('date'))['date__max']
    except Exception:
        date = None
    return date
def get_last_pcopy_id(self):
    Base().connection()
    try:
        last_pcopy = NormalizePublication.objects.all().aggregate(
            Max('CopyPublication_id'))['CopyPublication_id__max']
    except Exception:
        last_pcopy = None
    return last_pcopy
def save_status(self, count):
    Base().connection()
    if count > 0:
        status = 'Ok'
    else:
        status = 'Empty'
    CopyPublicationStatus.objects.create(status=status, count=count)
def __remove_already_have(self, vocabulary):
    Base().connection()
    for key, value in vocabulary.items():
        doubles = self.voc_models[key].objects.filter(
            name__in=vocabulary[key]).values('name')
        for double in doubles:
            self.__remove_from_array_by_value(vocabulary[key],
                                              double['name'])
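# A minimal sketch of the __remove_from_array_by_value helper called above;
# its body is not shown in the source, so the in-place removal below is an
# assumption about its behaviour.
def remove_from_array_by_value_sketch(array, value):
    # drop every occurrence of value from the list in place
    while value in array:
        array.remove(value)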
def __init__(self):
    self.pub_without_status_length = 100
    self.retrospective_days_delta = 10
    self.name = 'Поиск нечетких дубликатов'  # "Fuzzy duplicate search"
    self.file_name = 'pubcompare'
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
def save_status(self, count):
    Base().connection()
    if count is not None:
        status = 'Ok'
    else:
        status = 'Empty'
        count = 0
    VocabularyStatus.objects.create(status=status, count=count)
def get_last_error(self):
    Base().connection()
    try:
        max_date = VocabularyError.objects.all().aggregate(
            Max('date'))['date__max']
        last_error = VocabularyError.objects.get(date=max_date)
    except Exception:
        last_error = 'no status'
    return last_error
def get_last_status(self):
    Base().connection()
    try:
        max_date = CopyPublicationStatus.objects.all().aggregate(
            Max('date'))['date__max']
        last_status = CopyPublicationStatus.objects.get(date=max_date)
    except Exception:
        last_status = 'no status'
    return last_status
def get_pcopy_list(self, last_pcopy):
    Base().connection()
    # note: the passed-in value is overridden by the latest id from the db
    last_pcopy = self.get_last_pcopy_id()
    if last_pcopy is not None:
        pcopy_list = CopyPublication.objects.filter(
            id__gt=last_pcopy).values(
                *self.copypublication_fields)[:self.list_value]
    else:
        pcopy_list = CopyPublication.objects.all().values(
            *self.copypublication_fields)[:self.list_value]
    return pcopy_list
def save_status(self, count):
    Base().connection()
    if count > 0:
        status = 'Ok'
    else:
        status = 'Empty'
    PubCompareStatus.objects.create(
        status=status,
        count=count,
    )
def __add_vocabulary_grammems_to_remove_to_db(self, grammems_to_remove):
    Base().connection()
    for key in grammems_to_remove:
        words = []
        for word in grammems_to_remove[key]:
            words.append(self.grammems_to_remove_models[key](
                name=word['word'],
                crc32=word['crc32'],
            ))
        if len(words) > 0:
            self.grammems_to_remove_models[key].objects.bulk_create(words)
def __remove_already_have_grammems_to_remove(self, grammems_to_remove):
    Base().connection()
    for key, value in grammems_to_remove.items():
        doubles = self.grammems_to_remove_models[key].objects.filter(
            crc32__in=[word['crc32']
                       for word in grammems_to_remove[key]]).values('crc32')
        for double in doubles:
            for key2, word in enumerate(value):
                if word['crc32'] == double['crc32']:
                    del value[key2]
                    break
def __add_vocabulary_to_db(self, vocabulary):
    Base().connection()
    for key in vocabulary:
        words = []
        for word in vocabulary[key]:
            words.append(self.voc_models[key](
                name=word,
                crc32=self.__convert_crc32(word),
            ))
        if len(words) > 0:
            self.voc_models[key].objects.bulk_create(words)
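# A minimal sketch of the __convert_crc32 helper used above, assuming it is
# a plain zlib.crc32 over the UTF-8 encoded word (the vocabulary models
# store a crc32 column); the real helper is not shown in the source.
import zlib

def convert_crc32_sketch(word):
    return zlib.crc32(word.encode('utf-8'))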
def start(self):
    Base().connection()
    for key, table in self.voc_models.items():
        words = table.objects.filter(
            vikidict_scaned=False)[:self.list_value]
        if len(words) > 0:
            result = Vikidict().start(words)
            self.update_db(table, result)
    # save the number of processed words
    self.save_status(self.finded_synonims)
    self.finded_synonims = None
def start(self):
    Base().connection()
    for key, table in self.voc_models.items():
        words = table.objects.filter(
            vikidict_correction_tested=False,
            Tone__isnull=True,
        )[:self.list_value]
        if len(words) > 0:
            result = VikidictCorr().start(words)
            self.update_db(table, result)
    # save the number of words checked
    self.save_status(self.words_checked_count)
    self.words_checked_count = None
def save(self, normalized_list):
    Base().connection()
    normalized_publications = []
    for item in normalized_list:
        normalized_publications.append(
            NormalizePublication(
                crawler_id=item['crawler_id'],
                name=item['name'],
                name_cyrillic=item['name_cyrillic'],
                title=item['title'],
                text=item['text'],
                author=item['author'],
                pubdate=item['date'],
                CopyPublication_id=item['id'],
                title_words=item['title_words'],
                text_words=item['text_words'],
            ))
    count = len(normalized_publications)
    if count > 0:
        NormalizePublication.objects.bulk_create(normalized_publications)
    self.save_status(count)
def __init__(self):
    self.list_value = 400
    self.name = 'Канонизация'  # "Canonicalization"
    self.file_name = 'normalize_publications'
    self.morth = pymorphy2.MorphAnalyzer()
    self.base_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
    self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
    self.context = Base().create_daemon_context(self.file_name)
    self.punctuations = re.compile(
        '([-_<>?/\\".„”“%,{}@#!&()=+:;«»—$&£*])')
    self.replace_with_spaces = {
        '\n', '\r', '\r\n', '\v', '\x0b', '\f', '\x0c', '\x1c', '\x1d',
        '\x1e', '\x85', '\u2028', '\u2029', '<br>', '<br />', '<p>',
        '</p>', '...', '\t', '\xa0', ' ', ' ',
    }
    self.copypublication_fields = [
        'crawler_id',
        'name',
        'name_cyrillic',
        'title',
        'text',
        'author',
        'date',
        'id',
    ]
    self.grammems_to_remove = {
        'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'ROMN', 'UNKN'
    }
    self.grammems_to_remove_vocabulary = {
        'NPRO': [],
        'PRED': [],
        'PREP': [],
        'CONJ': [],
        'PRCL': [],
        'INTJ': [],
        'ROMN': [],
        'UNKN': [],
    }
    self.grammems_to_remove_models = {
        'NPRO': NPRO,
        'PRED': PRED,
        'PREP': PREP,
        'CONJ': CONJ,
        'PRCL': PRCL,
        'INTJ': INTJ,
        'ROMN': ROMN,
        'UNKN': UNKN,
    }
    self.vocabulary = {
        'NOUN': [],
        'ADJF': [],
        'ADJS': [],
        'COMP': [],
        'VERB': [],
        'INFN': [],
        'PRTF': [],
        'PRTS': [],
        'GRND': [],
        'NUMR': [],
        'ADVB': [],
        'LATN': [],
        'NUMB': [],
        'intg': [],
        'real': [],
    }
    self.voc_models = {
        'NOUN': NOUN,
        'ADJF': ADJF,
        'ADJS': ADJS,
        'COMP': COMP,
        'VERB': VERB,
        'INFN': INFN,
        'PRTF': PRTF,
        'PRTS': PRTS,
        'GRND': GRND,
        'NUMR': NUMR,
        'ADVB': ADVB,
        'LATN': LATN,
        'NUMB': NUMB,
        'intg': intg,
        'real': real,
    }
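# A minimal sketch (not the actual normalization method, which is not shown
# in the source) of how self.replace_with_spaces and self.punctuations could
# be applied to clean a publication before lemmatization with pymorphy2.
def clean_text_sketch(self, text):
    # replace line breaks, HTML remnants and exotic whitespace with spaces
    for token in self.replace_with_spaces:
        text = text.replace(token, ' ')
    # strip punctuation characters matched by the compiled pattern
    text = self.punctuations.sub('', text)
    # collapse repeated spaces left behind by the replacements
    return ' '.join(text.split())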
def clear_vocabulary(self):
    Base().connection()
    for key, value in self.voc_models.items():
        value.objects.all().delete()