def preprocessing_raw_data(self):
    """Clean raw documents in two passes over the whole Document table.

    Pass 1: strip HTML markup (and newlines) from ``text``/``title`` of any
    document that contains angle brackets, persisting each batch with
    ``bulk_update``.
    Pass 2: delete documents whose combined text+title is predominantly
    Kazakh (more than 7% Kazakh-specific Cyrillic letters).
    """
    batch_size = 10000

    def is_kazakh(text):
        # BUGFIX: the original compared the raw *count* of Kazakh letters to
        # 0.07, which is equivalent to "at least one Kazakh char" and defeats
        # the 7% threshold the constant implies. Compare the fraction instead.
        if not text:
            return False
        kazakh_chars = sum(c in "ӘәҒғҚқҢңӨөҰұҮүІі" for c in text)
        return kazakh_chars / len(text) > 0.07

    qs = Document.objects.filter(id__gt=0).order_by('id')
    number_of_documents = qs.count()

    # Pass 1: HTML stripping, batched to bound memory and DB round-trips.
    for i, batch in enumerate(batch_qs(qs, batch_size=batch_size)):
        print(f"Processing {i*batch_size}/{number_of_documents}")
        for j, doc in enumerate(batch):
            if i == 0:
                # Per-row progress only for the first batch, as a liveness probe.
                print(f"{j}/{batch_size}")
            if "<" in doc.text or ">" in doc.text or "<" in doc.title or ">" in doc.title:
                doc.text = BeautifulSoup(
                    doc.text, "html.parser").text.strip().replace('\n', '')
                doc.title = BeautifulSoup(
                    doc.title, "html.parser").text.strip().replace('\n', '')
        Document.objects.bulk_update(batch, fields=['text', 'title'])

    # Pass 2: remove Kazakh-language documents (now ratio-based, see is_kazakh).
    for i, batch in enumerate(batch_qs(qs, batch_size=batch_size)):
        print(f"Deleting {i*batch_size}/{number_of_documents}")
        for doc in batch:
            if is_kazakh(doc.text + doc.title):
                doc.delete()
def document_generator(qs):
    """Yield Elasticsearch-ready dicts for every document in *qs*.

    Each record's ``corpus`` is tagged ``hate_<class_label>``; a random
    *percent_test* share of records is routed to the ``hate_test`` corpus
    instead.
    """
    for chunk in batch_qs(qs, batch_size=batch_size):
        for model_doc in chunk:
            es_doc = ESDocument()
            es_doc.init_from_model(model_doc)
            payload = es_doc.to_dict()
            in_test_split = random.randint(1, 100) <= percent_test
            if in_test_split:
                payload['corpus'] = "hate_test"
            else:
                payload['corpus'] = f"hate_{payload['class_label']}"
            yield payload
def init_document_datetime_activity_parsed(apps, schema_editor):
    """Data migration: seed ``datetime_activity_parsed`` from ``datetime_created``.

    Operates on documents that have both a ``datetime`` and ``num_views``,
    processing them in fixed-size batches and persisting with ``bulk_update``.
    """
    DocumentModel = apps.get_model('mainapp', 'Document')
    batch_size = 10000
    qs = (
        DocumentModel.objects
        .exclude(datetime=None)
        .order_by('id')
        .only('datetime_activity_parsed', 'datetime_created',)
        .exclude(num_views=None)
        .order_by('id')
    )
    number_of_documents = qs.count()
    for batch_no, batch in enumerate(batch_qs(qs, batch_size=batch_size)):
        print(f"Processing {batch_no * batch_size}/{number_of_documents}")
        for row_no, doc in enumerate(batch):
            if batch_no == 0:
                # Per-row progress only for the first batch, as a liveness probe.
                print(f"{row_no}/{batch_size}")
            doc.datetime_activity_parsed = doc.datetime_created
        DocumentModel.objects.bulk_update(batch, fields=['datetime_activity_parsed'])
def copy_date(apps, schema_editor):
    """Data migration: populate the ``date`` field from ``datetime``'s date part.

    Skips rows with no ``datetime``; processes in fixed-size batches and
    persists each batch with ``bulk_update``.
    """
    DocumentModel = apps.get_model('mainapp', 'Document')
    batch_size = 10000
    qs = (
        DocumentModel.objects
        .exclude(datetime=None)
        .order_by('id')
        .only('datetime', 'date',)
    )
    number_of_documents = qs.count()
    for batch_no, batch in enumerate(batch_qs(qs, batch_size=batch_size)):
        print(f"Processing {batch_no * batch_size}/{number_of_documents}")
        for row_no, doc in enumerate(batch):
            if batch_no == 0:
                # Per-row progress only for the first batch, as a liveness probe.
                print(f"{row_no}/{batch_size}")
            doc.date = doc.datetime.date()
        DocumentModel.objects.bulk_update(batch, fields=['date'])
import datetime

from mainapp.models import *
from mainapp.services import batch_qs

# One-off repair script: a document datetime that lies in the future almost
# certainly had its day and month swapped at parse time. After shifting by
# +6 hours (local offset — presumably UTC+6/Almaty; TODO confirm), if the
# shifted day is a valid month number (<= 12), swap day and month back.
batch_size = 10000
qs = Document.objects.filter(id__gt=0).order_by('id')
number_of_documents = qs.count()
for i, batch in enumerate(batch_qs(qs, batch_size=batch_size)):
    print(f"Processing {i*batch_size}/{number_of_documents}")
    for j, doc in enumerate(batch):
        if i == 0:
            # Per-row progress only for the first batch, as a liveness probe.
            print(f"{j}/{batch_size}")
        if not doc.datetime:
            continue
        # now() is evaluated per row on purpose: behavior is preserved even
        # if the script runs across midnight.
        if doc.datetime.date() <= datetime.datetime.now().date():
            continue
        shifted = doc.datetime + datetime.timedelta(hours=6)
        if shifted.day <= 12:
            # shifted.month is always <= 12, so using it as the day is safe.
            doc.datetime = doc.datetime.replace(month=shifted.day, day=shifted.month)
    Document.objects.bulk_update(batch, fields=['datetime'])
def document_generator(self, qs):
    """Yield each document in *qs* as an Elasticsearch-ready dict.

    Streams the queryset in ``self.batch_size``-sized batches so the whole
    result set is never materialized at once.
    """
    for page in batch_qs(qs, batch_size=self.batch_size):
        for model_obj in page:
            es_doc = ESDocument()
            es_doc.init_from_model(model_obj)
            yield es_doc.to_dict()