def run(input_filename, output_filename):
    """Cluster article ids from the CSV at *input_filename* by shared identifiers.

    Each row is parsed into an Article; articles that share any populated
    identifier field (see IDENTIFIERS) are unioned into the same set of ids.
    Progress plus peak RSS is printed every 10000 rows, and processing stops
    early once ru_maxrss exceeds 1e7 (KB on Linux -- TODO confirm platform;
    the unit is bytes on macOS).

    Returns (articles, without_identifiers): the identifier -> id-set mapping
    and the set of article ids that carried no usable identifier.

    NOTE(review): output_filename is accepted but never used in the visible
    code -- presumably a later step writes the clusters; confirm before
    removing the parameter.
    """
    articles = defaultdict(set)   # (field, value) identifier -> set of article ids
    without_identifiers = set()   # ids of rows with no usable identifier
    biggest = 0                   # size of the largest cluster seen so far
    # `with` guarantees the input file is closed (the original leaked it).
    with open(input_filename, 'r') as input_file:
        reader = csv.reader(input_file)
        for i, row in enumerate(reader):
            article = Article(*row)
            # Keep only identifier fields that are present and non-empty.
            identifiers = [(k, v) for k, v in article._asdict().items()
                           if k in IDENTIFIERS and v]
            if not identifiers:
                without_identifiers.add(article.id)
                continue
            first = identifiers[0]
            articles[first].add(article.id)
            for identifier in identifiers[1:]:
                # Union the secondary cluster into the primary and alias the
                # secondary key onto the merged set.  NOTE(review): other keys
                # still aliasing the old set are not re-pointed here --
                # presumably repaired downstream (recluster); confirm.
                if articles[first] is not articles[identifier]:
                    articles[first] |= articles[identifier]
                    articles[identifier] = articles[first]
                if len(articles[identifier]) > biggest:
                    biggest = len(articles[identifier])
            if i % 10000 == 0:
                # Hoist the getrusage call: the original queried it twice.
                rss = resource.getrusage(resource.RUSAGE_SELF)[2]
                print("%7d %d %d" % (i, rss, biggest))
                if rss > 1e7:
                    # Stop cleanly instead of raising a bare Exception that a
                    # broad `except Exception` then silently swallowed.
                    print("Using too much memory")
                    break
    return articles, without_identifiers
def run(input_filename, output_filename):
    """Group article ids that share any identifier field (pmid, doi, ...).

    NOTE(review): this is a verbatim duplicate of the run() defined just
    above -- at import time this definition shadows the first; one of the two
    copies should be deleted.

    Reads CSV rows from *input_filename*, builds an Article per row, and
    unions the ids of articles that share any non-empty identifier listed in
    IDENTIFIERS.  Prints progress and peak RSS every 10000 rows and bails out
    once ru_maxrss exceeds 1e7 (KB on Linux -- TODO confirm platform).

    Returns (clusters, unidentified).  *output_filename* is unused in the
    visible code -- confirm against callers before removing it.
    """
    clusters = defaultdict(set)     # (field, value) -> set of article ids
    unidentified = set()            # article ids lacking every identifier
    largest = 0                     # running maximum cluster size
    # Context manager closes the file the original left open.
    with open(input_filename, 'r') as fh:
        for row_number, row in enumerate(csv.reader(fh)):
            record = Article(*row)
            keys = [(field, value)
                    for field, value in record._asdict().items()
                    if field in IDENTIFIERS and value]
            if not keys:
                unidentified.add(record.id)
                continue
            primary = keys[0]
            clusters[primary].add(record.id)
            for key in keys[1:]:
                # Union the secondary cluster into the primary, then alias the
                # secondary key to the merged set.  NOTE(review): stale aliases
                # to the old set are left behind; presumably repaired by a
                # later recluster pass -- confirm.
                if clusters[primary] is not clusters[key]:
                    clusters[primary] |= clusters[key]
                    clusters[key] = clusters[primary]
                largest = max(largest, len(clusters[key]))
            if row_number % 10000 == 0:
                rss = resource.getrusage(resource.RUSAGE_SELF)[2]
                print("%7d %d %d" % (row_number, rss, largest))
                if rss > 1e7:
                    # Replaces the raise-Exception / except-print dance that
                    # swallowed every other error as well.
                    print("Using too much memory")
                    break
    return clusters, unidentified
from model import Article
from recluster import recluster

# Identifier fields used to decide that two rows describe the same article.
IDENTIFIERS = ('pmid', 'doi')

articles = defaultdict(list)    # (field, value) identifier -> list of Article rows
without_identifiers = 0         # count of rows that carried no usable identifier

reader = csv.reader(open('../parsed/articles.csv', 'r'))
writer = csv.writer(open('../parsed/clustered.csv', 'w'))
# NOTE(review): `writer` is not used in this chunk -- presumably the clusters
# are written out further down the file; confirm before closing it here.

try:
    for i, row in enumerate(reader):
        article = Article(*row)
        # Only populated identifier fields participate in clustering.
        identifiers = [(k, v) for k, v in article._asdict().items()
                       if k in IDENTIFIERS and v]
        if not identifiers:
            without_identifiers += 1
            continue
        first = identifiers[0]
        articles[first].append(article)
        for identifier in identifiers[1:]:
            # Merge the secondary cluster's rows into the primary list, then
            # alias the secondary key to the merged list.  NOTE(review): stale
            # aliases to the old list are not re-pointed here.
            if articles[first] is not articles[identifier]:
                articles[first] += articles[identifier]
                articles[identifier] = articles[first]
        if i % 10000 == 0:
            print("%7d" % i)
except Exception as e:
    # Was a bare `except: pass`, which silently hid every failure (including
    # KeyboardInterrupt).  Report the error instead of swallowing it.
    print("clustering aborted: %s" % e)
# NOTE(review): this whole section duplicates the block above it verbatim --
# running the file would redo the clustering pass from scratch (it rebinds
# `articles` and re-opens both files); one of the two copies should be removed.
from model import Article
from recluster import recluster

IDENTIFIERS = ('pmid', 'doi')   # fields treated as authoritative identifiers

articles = defaultdict(list)    # (field, value) -> list of Article rows
without_identifiers = 0         # rows lacking any identifier

reader = csv.reader(open('../parsed/articles.csv', 'r'))
# NOTE(review): writer is unused in the visible span -- presumably used later
# in the file; confirm before wrapping it in a context manager.
writer = csv.writer(open('../parsed/clustered.csv', 'w'))

try:
    for row_number, fields in enumerate(reader):
        record = Article(*fields)
        keys = [(name, value) for name, value in record._asdict().items()
                if name in IDENTIFIERS and value]
        if not keys:
            without_identifiers += 1
            continue
        primary = keys[0]
        articles[primary].append(record)
        for key in keys[1:]:
            # Concatenate the secondary cluster into the primary, then make
            # the secondary key point at the merged list.  NOTE(review): other
            # keys still aliasing the old list are left stale here.
            if articles[primary] is not articles[key]:
                articles[primary] += articles[key]
                articles[key] = articles[primary]
        if row_number % 10000 == 0:
            print("%7d" % row_number)
except Exception as exc:
    # Replaces a bare `except: pass` that silently swallowed all errors,
    # including KeyboardInterrupt and SystemExit.
    print("clustering aborted: %s" % exc)