def handle(self, *args, **options):
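        # Stream every declaration that has the aggregated card and dump it
        # to CSV. The DictWriter is created lazily, once the first row is
        # seen, because the CSV columns come from the aggregated card keys.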
        to_export = NACPDeclaration.search().source(
            include=[AGGREGATED_FIELD_NAME]).query("exists",
                                                   field=AGGREGATED_FIELD_NAME)

        if not options["export_all"]:
            to_export = to_export.query(
                "bool",
                must=[Q("term", intro__doc_type="Щорічна")],
                must_not=[Q("exists", field="corrected_declarations")])

        if options["filter_future_declarations"]:
            to_export = to_export.query(
                "range", intro__declaration_year={"lt": datetime.now().year})

        w = None
        with open(options["destination"], "w") as fp:
            for i, d in enumerate(to_export.scan()):
                row = d[AGGREGATED_FIELD_NAME].to_dict()
                row['id'] = d.meta.id

                if not w:
                    w = DictWriter(fp, fieldnames=row.keys())
                    w.writeheader()

                w.writerow(row)
                if i % 10000 == 0 and i:
                    self.stdout.write("{} declarations exported".format(i))
    def get_raw_data(self, year, order_by, limit=10000):
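        # "Leaderboard" query: annual declarations of the given year, with
        # corrected, manually excluded and outlier documents removed, sorted
        # by one key of the aggregated card in descending order.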
        to_export = NACPDeclaration.search().source(
            include=[AGGREGATED_FIELD_NAME]).query("exists", field=AGGREGATED_FIELD_NAME)

        to_export = to_export.query(
            "bool",
            must=[
                Q("term", intro__doc_type="Щорічна"),
                Q("term", intro__declaration_year=year)
            ],
            must_not=[
                Q("exists", field="corrected_declarations"),
                # Declarations excluded manually because of known bad data:
                Q("term", _id="nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"),  # Melnytchuk
                Q("term", _id="nacp_c67549d0-abc0-48fe-b529-9185efe1a3ce"),
                Q("term", _id="nacp_2e07bb01-5ca8-4188-97c6-6297f7a4d2ad"),
                Q("term", _id="nacp_f1b25e4d-e691-48d6-99b1-758e94764b91"),  # Motsyor
                Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
            ]
        ).sort(
            {'{}.{}'.format(AGGREGATED_FIELD_NAME, order_by): {"order": "desc"}}
        )[:limit]

        res = []

        for d in to_export.execute():
            row = d[AGGREGATED_FIELD_NAME].to_dict()
            # Skip implausibly huge values (over 10 billion): almost
            # certainly data-entry errors in the source declarations
            if row[order_by] > 10_000_000_000:
                continue

            row["id"] = d._id
            res.append(row)

        return res
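
A minimal usage sketch (assumptions: it is called on the same class instance,
and "assets.total" is a hypothetical key of the aggregated card; substitute
one that exists in your index):

    # Hypothetical call: top 100 annual declarations of 2019, largest first
    top = self.get_raw_data(year=2019, order_by="assets.total", limit=100)
    for row in top[:3]:
        print(row["id"], row["assets.total"])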
Example #3
    def handle(self, *args, **options):
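        # Export selected fields of all declarations to CSV: a from/to slice
        # fetches a single page, otherwise scan() streams the whole index.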
        all_decls = NACPDeclaration.search().query('match_all')
        if options["to"] is not None:
            all_decls = all_decls[options["from"]:options["to"]].execute()
        elif options["from"]:
            all_decls = all_decls[options["from"]:].execute()
        else:
            all_decls = all_decls.scan()

        w = DictWriter(options["outfile"],
                       fieldnames=["_id"] + options["field"])

        w.writeheader()

        for decl in all_decls:
            decl_dict = decl.to_dict()

            row = {
                field: self.fetch_field(decl_dict, field)
                for field in options["field"]
            }

            row["_id"] = decl.meta.id

            w.writerow(row)
Example #4
    def get_raw_data(self, year, order_by, limit=10000):
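        # A variant of the leaderboard query above, with fewer manual
        # exclusions and no sanity cap on the sort field.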
        to_export = NACPDeclaration.search().source(
            include=[AGGREGATED_FIELD_NAME]).query("exists",
                                                   field=AGGREGATED_FIELD_NAME)

        to_export = to_export.query(
            "bool",
            must=[
                Q("term", intro__doc_type="Щорічна"),
                Q("term", intro__declaration_year=year)
            ],
            must_not=[
                Q("exists", field="corrected_declarations"),
                # Excluded manually (Melnytchuk):
                Q("term", _id="nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"),
                Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
            ]
        ).sort(
            {"{}.{}".format(AGGREGATED_FIELD_NAME, order_by): {"order": "desc"}}
        )[:limit]

        res = []

        for d in to_export.execute():
            row = d[AGGREGATED_FIELD_NAME].to_dict()
            row["id"] = d._id
            res.append(row)

        return res
Example #5
    def handle(self, *args, **options):
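        # Dump id, url, filing date, doc type and position fields of every
        # declaration filed since a given year, with a tqdm progress bar.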
        all_decls = NACPDeclaration.search().query("match_all").source([
            "declaration.url",
            "intro.date",
            "intro.doc_type",
            "nacp_orig.step_1",
        ])

        all_decls = all_decls.filter(
            "range",
            intro__date={
                "gte": date(options["year_since"], 1, 1),
                "lt": datetime.now().replace(
                    hour=0, minute=0, second=0, microsecond=0),
            },
        )

        w = DictWriter(
            options["outfile"],
            fieldnames=[
                "id",
                "declaration.url",
                "intro.date",
                "intro.doc_type",
                "nacp_orig.step_1.postCategory",
                "nacp_orig.step_1.postType",
            ],
        )

        for decl in tqdm(all_decls.scan(), total=all_decls.count()):
            w.writerow({
                "id": decl.meta.id,
                "declaration.url": decl.declaration.url,
                "intro.date": decl.intro.date.date(),
                "intro.doc_type": decl.intro.doc_type,
                "nacp_orig.step_1.postCategory": getattr(
                    decl.nacp_orig.step_1, "postCategory", ""),
                "nacp_orig.step_1.postType": getattr(
                    decl.nacp_orig.step_1, "postType", ""),
            })
Example #6
    def handle(self, *args, **options):
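        # (Re)index NACP declarations from a folder of JSON files: collect
        # the file list, optionally drop documents already present in the
        # index, then parse in a process pool and bulk-insert chunk by chunk.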
        try:
            base_dir = options['file_path']
            corrected_file = options['corrected_file']
        except KeyError:
            raise CommandError(
                'First argument must be a path to the source files, the '
                'second one the file name of a CSV with corrected declarations')

        self.stdout.write("Gathering JSON documents from {}".format(base_dir))
        self.jsons = list(glob2.glob(os.path.join(base_dir, "**/*.json")))
        self.stdout.write("Gathered {} JSON documents".format(len(self.jsons)))

        corrected = set()
        with open(corrected_file, "r") as fp:
            r = DictReader(fp)
            for row in r:
                corrected.add(row["uuid"])

        DeclarationStaticObj.corrected = corrected

        NACPDeclaration.init()
        counter = 0

        my_tiny_pool = Pool(self.number_of_processes)

        if not options["update_all_docs"]:
            self.stdout.write("Obtaining uuids of already indexed documents")

            s = NACPDeclaration.search().source([])
            existing_guids = set(
                h.meta.id.replace("nacp_", "") for h in s.scan())
            self.stdout.write("{} uuids are currently in index".format(
                len(existing_guids)))

            incoming_files = dict(
                filter(
                    None,
                    my_tiny_pool.map(parse_guid_from_fname, self.jsons)
                )
            )

            incoming_guids = set(incoming_files.keys())

            self.stdout.write("{} uuids are found in input folder".format(
                len(incoming_guids)))

            self.jsons = [
                incoming_files[k] for k in incoming_guids - existing_guids
            ]

            self.stdout.write("{} uuids left after the filtering".format(
                len(self.jsons)))

        for ix in range(0, len(self.jsons), self.chunk_size):
            chunk = self.jsons[ix:ix + self.chunk_size]

            result = list(
                filter(
                    None,
                    my_tiny_pool.map(DeclarationStaticObj.parse, chunk)
                )
            )

            counter += len(result)

            bulk(self.es, result)

            if ix:
                self.stdout.write(
                    'Loaded {} items to persistence storage'.format(ix))

        self.stdout.write(
            'Finished loading {} items to persistence storage'.format(counter))
Example #7
    def pull_declarations(self):
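        # Two-pass search: the first pass matches by full name together with
        # workplace keywords; the second pulls the remaining documents by
        # user_declarant_id or, failing that, by obj_ids of the declarations
        # already linked to this person. Keywords containing "область"
        # (oblast) are matched against the region fields only.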
        def get_search_clause(kwd):
            if "область" not in kwd:
                return Q(
                    "multi_match",
                    query=kwd,
                    operator="or",
                    minimum_should_match=1,
                    fields=[
                        "general.post.region",
                        "general.post.office",
                        "general.post.post",
                        "general.post.actual_region",
                    ],
                )
            else:
                return Q(
                    "multi_match",
                    query=kwd,
                    fields=[
                        "general.post.region", "general.post.actual_region"
                    ],
                )

        search_clauses = [
            get_search_clause(x) for x in filter(
                None, map(str.strip, self.body.keywords.split("\n")))
        ]

        q = "{} {}".format(self.name, self.extra_keywords)

        if search_clauses:
            for sc in search_clauses:
                first_pass = (NACPDeclaration.search().query(
                    "bool",
                    must=[
                        Q(
                            "match",
                            general__full_name={
                                "query": q,
                                "operator": "and"
                            },
                        )
                    ],
                    should=[sc],
                    minimum_should_match=1,
                )[:100].execute())

                if first_pass:
                    break
        else:
            first_pass = (NACPDeclaration.search().query(
                "bool",
                must=[
                    Q("match",
                      general__full_name={
                          "query": q,
                          "operator": "and"
                      })
                ],
            )[:100].execute())

        Declaration.objects.create_declarations(self, first_pass)

        user_declarant_ids = set(
            filter(
                None,
                self.declarations.exclude(exclude=True).values_list(
                    "user_declarant_id", flat=True),
            ))

        if user_declarant_ids:
            second_pass = NACPDeclaration.search().filter(
                "terms",
                **{"intro.user_declarant_id": list(user_declarant_ids)})

            second_pass = second_pass.execute()

        if not user_declarant_ids or not second_pass:
            obj_ids_to_find = set(
                chain(*self.declarations.exclude(
                    exclude=True).values_list("obj_ids", flat=True)))

            second_pass = NACPDeclaration.search().query(
                "bool",
                must=[
                    Q("match",
                      general__full_name={
                          "query": q,
                          "operator": "or"
                      }),
                    Q("match", obj_ids=" ".join(list(obj_ids_to_find)[:512])),
                ],
                should=[],
                minimum_should_match=0,
            )[:100]

            second_pass = second_pass.execute()

        Declaration.objects.create_declarations(self, second_pass)
Example #8
    def handle(self, *args, **options):
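        # For every corrected declaration, hunt for its original: same year
        # and doc_type, similar full name, post, office, region and family
        # members, with constant-score clauses combined via function_score.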
        corrected = NACPDeclaration.search().filter("term",
                                                    intro__corrected=True)

        cntr = 0
        success_rate = 0  # count of matched declarations; reported as a percentage below
        for i, d in enumerate(corrected.scan()):
            must = [
                ConstantScore(
                    query=Q(
                        "multi_match",
                        query=d.general.full_name,
                        operator="and",
                        fields=[
                            "general.last_name",
                            "general.name",
                            "general.patronymic",
                            "general.full_name",
                        ],
                    ),
                    boost=10,
                )
            ]

            should = [
                ConstantScore(
                    query=Q(
                        "match",
                        general__post__post={
                            "query": d.general.post.post,
                            "minimum_should_match": "50%",
                        },
                    ),
                    boost=2,
                ),
                ConstantScore(
                    query=Q(
                        "match",
                        general__post__office={
                            "query": d.general.post.office,
                            "minimum_should_match": "50%",
                        },
                    ),
                    boost=2,
                ),
                # Region names are compared without the " область" ("oblast") suffix
                ConstantScore(
                    query=Q(
                        "match",
                        general__post__region={
                            "query": d.general.post.region.replace(" область", ""),
                            "minimum_should_match": "60%",
                        },
                    ),
                    boost=1,
                ),
            ]

            for fam in getattr(d.general, "family", []):
                should.append(
                    ConstantScore(
                        query=Q(
                            "multi_match",
                            query=fam.family_name,
                            operator="and",
                            fields=["general.family.family_name"],
                        ),
                        boost=2,
                    )
                )

            candidates = NACPDeclaration.search() \
                .query(
                    FunctionScore(
                        query=Q("bool", must=must, should=should),
                        score_mode="sum"
                    )
                ) \
                .filter("term",
                    intro__declaration_year=d.intro.declaration_year) \
                .query(~Q('term', _id=d.meta.id)) \
                .filter("term", intro__corrected=False) \
                .query(
                    ConstantScore(
                        query=Q("term", intro__doc_type=d.intro.doc_type),
                        boost=0
                    )
                )

            if options["store_matches"]:
                candidates = candidates \
                    .highlight_options(
                        order='score', fragment_size=500,
                        number_of_fragments=100, pre_tags=['||!'],
                        post_tags=["||"]) \
                    .highlight(
                        "general.full_name", "general.post.region",
                        "general.post.office", "general.post.post",
                        "general.family.family_name")

            candidates = candidates.execute()

            success = self.store_example(
                d,
                candidates,
                debug=options["debug"],
                store_matches=options["store_matches"])

            if success:
                success_rate += 1

            cntr += 1

            if cntr and cntr % 5000 == 0:
                self.stdout.write("%s declarations processed, SR: %s%%" %
                                  (cntr, success_rate / cntr * 100))

        self.stdout.write("%s declarations processed, SR: %s%%" %
                          (cntr, success_rate / cntr * 100))

        if options["store_matches"]:
            self.save_to_excel(options["store_matches"])
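
A hedged invocation sketch (the management command name "match_corrected" is
an assumption; the flags mirror the options read in handle() above):

    python manage.py match_corrected --debug --store_matches matches.xlsx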