def run(input_filename, output_filename):
    """Group the records of a gzipped tar archive by shared identifiers.

    Every record yielded by ``get_records(tar, types=SPLIT_OUT)`` is reduced
    to its non-empty identifiers: the fields named in ``IDENTIFIERS`` passed
    through ``tidy_identifier``, plus, for records of type ``Organization``,
    a synthetic ``'org'`` identifier hashed from name and address.  Records
    that share an identifier -- directly or through a chain of other
    records -- are collected in one set of record ids; records without any
    identifier are collected separately.

    Parameters:
        input_filename: path of the ``.tar.gz`` archive to read.
        output_filename: not used by the part of the function shown here.

    Every 10000 records a progress line is printed: record index, number of
    known identifiers, number of records without identifiers, size of the
    largest group seen so far, and ``ru_maxrss`` of the current process.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    record_file = tempfile.NamedTemporaryFile(delete=False)

    # articles: identifier -> set of record ids.  Identifiers known to
    # describe the same thing are bound to the *same* set object.
    articles, without_identifiers = defaultdict(set), set()
    # id(set object) -> every identifier currently bound to that set, so a
    # merge can rebind all of them instead of leaving stale aliases behind.
    aliases = {}
    biggest = 0

    def group_of(identifier):
        """Return the set bound to *identifier*, registering it if new."""
        group = articles.get(identifier)
        if group is None:
            group = articles[identifier]  # created by the defaultdict
            aliases[id(group)] = [identifier]
        return group

    try:
        for i, (_, record) in enumerate(get_records(tar, types=SPLIT_OUT)):
            identifiers = [
                tidy_identifier(k, record[k]) for k in record
                if k in IDENTIFIERS
            ]
            # Keep only identifiers whose value part is non-empty.
            identifiers = [ident for ident in identifiers if ident[1]]
            if record['type'] == 'Organization':
                identifiers.append(
                    ('org', hash((record.get('name'), record.get('address')))))
            if not identifiers:
                without_identifiers.add(record['id'])
                continue

            group = group_of(identifiers[0])
            group.add(record['id'])
            for identifier in identifiers[1:]:
                other = group_of(identifier)
                if other is group:
                    continue
                # Absorb the other group and rebind *every* identifier that
                # pointed at it.  Rebinding only ``identifier`` would leave
                # the remaining aliases on a stale set and split the group.
                group |= other
                absorbed = aliases.pop(id(other))
                for alias in absorbed:
                    articles[alias] = group
                aliases[id(group)].extend(absorbed)

            # Checked for every record, including single-identifier ones.
            if len(group) > biggest:
                biggest = len(group)

            if i % 10000 == 0:
                print("%7d %7d %7d %7d %10d" % (
                    i, len(articles), len(without_identifiers), biggest,
                    resource.getrusage(resource.RUSAGE_SELF)[2]))
                # Drop tarfile's cached member list to limit memory growth.
                # NOTE(review): the flattened source does not show whether
                # this reset ran on every record or only with the progress
                # line -- confirm against the original indentation.
                tar.members = []
    except BaseException:
        # Deliberately broad: the traceback is printed and execution
        # continues after the loop (presumably so an interrupted pass can
        # still be used -- confirm).
        traceback.print_exc()
def run(input_filename, output_filename):
    """Group the records of a gzipped tar archive by shared identifiers.

    Every record yielded by ``get_records(tar, types=SPLIT_OUT)`` is reduced
    to its non-empty identifiers: the fields named in ``IDENTIFIERS`` passed
    through ``tidy_identifier``, plus, for records of type ``Organization``,
    a synthetic ``'org'`` identifier hashed from name and address.  Records
    that share an identifier -- directly or through a chain of other
    records -- are collected in one set of record ids; records without any
    identifier are collected separately.

    Parameters:
        input_filename: path of the ``.tar.gz`` archive to read.
        output_filename: not used by the part of the function shown here.

    Every 10000 records a progress line is printed: record index, number of
    known identifiers, number of records without identifiers, size of the
    largest group seen so far, and ``ru_maxrss`` of the current process.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    record_file = tempfile.NamedTemporaryFile(delete=False)

    # articles: identifier -> set of record ids.  Identifiers known to
    # describe the same thing are bound to the *same* set object.
    articles, without_identifiers = defaultdict(set), set()
    # id(set object) -> every identifier currently bound to that set, so a
    # merge can rebind all of them instead of leaving stale aliases behind.
    aliases = {}
    biggest = 0

    def group_of(identifier):
        """Return the set bound to *identifier*, registering it if new."""
        group = articles.get(identifier)
        if group is None:
            group = articles[identifier]  # created by the defaultdict
            aliases[id(group)] = [identifier]
        return group

    try:
        for i, (_, record) in enumerate(get_records(tar, types=SPLIT_OUT)):
            identifiers = [
                tidy_identifier(k, record[k]) for k in record
                if k in IDENTIFIERS
            ]
            # Keep only identifiers whose value part is non-empty.
            identifiers = [ident for ident in identifiers if ident[1]]
            if record['type'] == 'Organization':
                identifiers.append(
                    ('org', hash((record.get('name'), record.get('address')))))
            if not identifiers:
                without_identifiers.add(record['id'])
                continue

            group = group_of(identifiers[0])
            group.add(record['id'])
            for identifier in identifiers[1:]:
                other = group_of(identifier)
                if other is group:
                    continue
                # Absorb the other group and rebind *every* identifier that
                # pointed at it.  Rebinding only ``identifier`` would leave
                # the remaining aliases on a stale set and split the group.
                group |= other
                absorbed = aliases.pop(id(other))
                for alias in absorbed:
                    articles[alias] = group
                aliases[id(group)].extend(absorbed)

            # Checked for every record, including single-identifier ones.
            if len(group) > biggest:
                biggest = len(group)

            if i % 10000 == 0:
                print("%7d %7d %7d %7d %10d" % (
                    i, len(articles), len(without_identifiers), biggest,
                    resource.getrusage(resource.RUSAGE_SELF)[2]))
                # Drop tarfile's cached member list to limit memory growth.
                # NOTE(review): the flattened source does not show whether
                # this reset ran on every record or only with the progress
                # line -- confirm against the original indentation.
                tar.members = []
    except BaseException:
        # Deliberately broad: the traceback is printed and execution
        # continues after the loop (presumably so an interrupted pass can
        # still be used -- confirm).
        traceback.print_exc()
articles = dict((id(l), l) for l in articles.values()).values() groups = {} for i, article_list in enumerate(articles): for article_id in article_list: groups[article_id] = i for article in without_identifiers: i += 1 groups[article] = i del without_identifiers, articles RELATIONS = 'author editor translator'.split() try: for i, (index, record) in enumerate(get_records(tar, True, types=SPLIT_OUT)): #pprint.pprint(index) records, group = [], groups[record['id']] to_add, queue, data = set(), set([record['id']]), {'group': group, 'records': records} while queue: id_ = queue.pop() record = index[id_] to_add.add(id_) for id_ in itertools.chain(*[(lambda x:(x if isinstance(x, list) else [x]))(record.get(k, [])) for k in RELATIONS]): #print id_ id_ = id_['ref'][1:] if id_ not in to_add: if id_ in index and index[id_]['type'] not in SPLIT_OUT: queue.add(id_)
articles = dict((id(l), l) for l in articles.values()).values() groups = {} for i, article_list in enumerate(articles): for article_id in article_list: groups[article_id] = i for article in without_identifiers: i += 1 groups[article] = i del without_identifiers, articles RELATIONS = 'author editor translator'.split() try: for i, (index, record) in enumerate(get_records(tar, True, types=SPLIT_OUT)): #pprint.pprint(index) records, group = [], groups[record['id']] to_add, queue, data = set(), set([record['id']]), { 'group': group, 'records': records } while queue: id_ = queue.pop() record = index[id_] to_add.add(id_) for id_ in itertools.chain( *[(lambda x: (x if isinstance(x, list) else [x]) )(record.get(k, [])) for k in RELATIONS]): #print id_