Пример #1
0
    def process(self, data):
        """Classify one registration and append it to the chosen output.

        `data` is handed to `Registration.from_json`. The output chosen by
        `self.disposition` is remembered under the registration's UUID (when
        it has one) so that registrations naming it as their parent can look
        it up later. The registration is then serialized as one JSON
        document followed by a newline.
        """
        reg = Registration.from_json(data)
        sink = self.disposition(reg)

        own_uuid = reg.uuid
        if own_uuid:
            self.output_for_uuid[own_uuid] = sink

        parent = reg.parent
        if parent:
            # Children were emitted right after their parents by the
            # previous step, so the parent's output has already been
            # recorded by the time a child arrives here.
            parent_sink = self.output_for_uuid[parent['uuid']]

            # Children are normally independent registrations. But the
            # parent is the registration with the most data available, so
            # if it was judged out of range the children probably are too:
            # a child that looks in range follows its parent instead.
            if parent_sink and sink == self.in_range:
                if parent_sink != self.in_range:
                    reg.disposition = "Classified with parent."
                    reg.warnings.append(
                        "This registration seems to be in range, but it was associated "
                        "with a registration which was a foreign publication or not in "
                        "range. To be safe, this registration will be put in the same "
                        "category as its 'parent'; it should be checked manually."
                    )
                    sink = parent_sink

        payload = reg.jsonable(require_disposition=True)
        json.dump(payload, sink)
        sink.write("\n")
        if penalty > 0:
            # Beyond "a couple typos", the Levenshtein distance
            # basically means there's no match, so we cap the penalty
            # at a pretty low level.
            penalty = min(penalty, 0.20)
        return penalty


# Script-level setup: build the Comparator from output/ia-0-texts.ndjson
# (presumably the Internet Archive text records produced by an earlier
# step -- confirm) and open the file that receives this step's output.
# NOTE(review): `output` is opened without a `with` block and is not closed
# in the code visible here -- confirm it is closed once the script finishes.
comparator = Comparator("output/ia-0-texts.ndjson")
output = open("output/ia-1-matched.ndjson", "w")

for filename in ["FINAL-not-renewed.ndjson"
                 ]:  #"FINAL-possibly-renewed.ndjson"]:
    for i in open("output/%s" % filename):
        cce = Registration.from_json(json.loads(i))
        title = cce.title
        if not title or not comparator.normalize(title):
            continue
        matches = list(comparator.matches(cce))

        # If there are a huge number of IA matches for a CCE title,
        # penalize them -- it's probably a big mess that must be dealt
        # with separately. Give a slight boost if there's only a single
        # match.
        if len(matches) == 1:
            num_matches_coefficient = 1.1
        elif len(matches) <= MATCH_CUTOFF:
            num_matches_coefficient = 1
        else:
            num_matches_coefficient = 1 - (len(matches) -
Пример #3
0
 def convert(self, input_file):
     """Write a header row, then one row per registration in `input_file`.

     The header is `Registration.csv_row_labels` followed by
     `Renewal.csv_row_labels`. Each line of `input_file` is parsed as a
     JSON document, turned into a `Registration`, and its `csv_row` is
     passed to `self.out.writerow` (presumably a `csv.writer` -- confirm).

     :param input_file: Path of a file holding one JSON document per line.
     """
     self.out.writerow(Registration.csv_row_labels + Renewal.csv_row_labels)
     # Open the input in a `with` block so the handle is closed as soon as
     # the conversion finishes, including when a line fails to parse.
     with open(input_file) as lines:
         for line in lines:
             registration = Registration.from_json(json.loads(line))
             self.out.writerow(registration.csv_row)