def run(input_filename, output_filename):
    """Cluster article ids by shared identifiers while watching memory use.

    Each row of the CSV file ``input_filename`` is turned into an ``Article``.
    For every non-empty field listed in ``IDENTIFIERS`` a ``(field, value)``
    key is formed, and the article id is added to the set stored under the
    first such key.  The sets of the article's remaining keys are merged into
    that set and the keys are re-pointed at it.

    Every 10000 rows a progress line is printed: the row index, field 2 of
    ``resource.getrusage`` (``ru_maxrss``; its unit is platform dependent) and
    the size of the largest merged set so far.  When that memory figure
    exceeds 1e7 the run is aborted.

    ``output_filename`` is accepted but not used here; nothing is written and
    nothing is returned.  Any ``Exception`` raised while processing rows is
    printed instead of being propagated.
    """
    articles = defaultdict(set)
    without_identifiers = set()

    # The context manager closes the input even when processing stops early.
    with open(input_filename, 'r') as source:
        try:
            biggest = 0

            for i, row in enumerate(csv.reader(source)):
                article = Article(*row)
                identifiers = [(k, v) for k, v in article._asdict().items()
                               if k in IDENTIFIERS and v]
                if not identifiers:
                    without_identifiers.add(article.id)
                    continue

                cluster = articles[identifiers[0]]
                cluster.add(article.id)
                for identifier in identifiers[1:]:
                    other = articles[identifier]
                    if cluster is not other:
                        # NOTE(review): only this key is re-pointed; any other
                        # key still aliased to ``other`` keeps the old set and
                        # will not see later additions -- confirm intended.
                        cluster |= other
                        articles[identifier] = cluster
                        biggest = max(biggest, len(cluster))

                if i % 10000 == 0:
                    max_rss = resource.getrusage(resource.RUSAGE_SELF)[2]
                    print("%7d %s %s" % (i, max_rss, biggest))
                    if max_rss > 1e7:
                        print("Using too much memory")
                        raise Exception
        except Exception as e:
            print(e)
def run(input_filename, output_filename):
    """Cluster article ids by shared identifiers while watching memory use.

    Each row of the CSV file ``input_filename`` is turned into an ``Article``.
    For every non-empty field listed in ``IDENTIFIERS`` a ``(field, value)``
    key is formed, and the article id is added to the set stored under the
    first such key.  The sets of the article's remaining keys are merged into
    that set and the keys are re-pointed at it.

    Every 10000 rows a progress line is printed: the row index, field 2 of
    ``resource.getrusage`` (``ru_maxrss``; its unit is platform dependent) and
    the size of the largest merged set so far.  When that memory figure
    exceeds 1e7 the run is aborted.

    ``output_filename`` is accepted but not used here; nothing is written and
    nothing is returned.  Any ``Exception`` raised while processing rows is
    printed instead of being propagated.
    """
    articles = defaultdict(set)
    without_identifiers = set()

    # The context manager closes the input even when processing stops early.
    with open(input_filename, 'r') as source:
        try:
            biggest = 0

            for i, row in enumerate(csv.reader(source)):
                article = Article(*row)
                identifiers = [(k, v) for k, v in article._asdict().items()
                               if k in IDENTIFIERS and v]
                if not identifiers:
                    without_identifiers.add(article.id)
                    continue

                cluster = articles[identifiers[0]]
                cluster.add(article.id)
                for identifier in identifiers[1:]:
                    other = articles[identifier]
                    if cluster is not other:
                        # NOTE(review): only this key is re-pointed; any other
                        # key still aliased to ``other`` keeps the old set and
                        # will not see later additions -- confirm intended.
                        cluster |= other
                        articles[identifier] = cluster
                        biggest = max(biggest, len(cluster))

                if i % 10000 == 0:
                    max_rss = resource.getrusage(resource.RUSAGE_SELF)[2]
                    print("%7d %s %s" % (i, max_rss, biggest))
                    if max_rss > 1e7:
                        print("Using too much memory")
                        raise Exception
        except Exception as e:
            print(e)
Exemplo n.º 3
0
from model import Article
from recluster import recluster


# Article fields used as clustering keys.
IDENTIFIERS = ('pmid', 'doi')
# Maps each (field, value) key to the list of articles filed under it; keys
# that occur on the same article are pointed at one shared list object.
articles = defaultdict(list)

# Number of rows that had none of the IDENTIFIERS fields filled in.
without_identifiers = 0

# NOTE(review): neither file is closed in this section; ``writer`` is
# presumably used further down the script -- confirm before adding a close.
reader = csv.reader(open('../parsed/articles.csv', 'r'))
writer = csv.writer(open('../parsed/clustered.csv', 'w'))

try:
    for i, article in enumerate(reader):
        article = Article(*article)
        identifiers = [(k, v) for k, v in article._asdict().items()
                       if k in IDENTIFIERS and v]
        if not identifiers:
            without_identifiers += 1
            continue
        articles[identifiers[0]].append(article)
        for identifier in identifiers[1:]:
            if articles[identifiers[0]] is not articles[identifier]:
                # In-place extend keeps the first key's list object, then the
                # other key is re-pointed at that same list.
                articles[identifiers[0]] += articles[identifier]
                articles[identifier] = articles[identifiers[0]]

        # Progress marker every 10000 rows.
        if i % 10000 == 0:
            print("%7d" % i)
except (KeyboardInterrupt, Exception) as error:
    # Still best-effort (whatever was clustered so far is kept), but the
    # reason for stopping is reported instead of being silently discarded.
    print("Clustering stopped early: %r" % (error,))
Exemplo n.º 4
0
from model import Article
from recluster import recluster

# Article fields used as clustering keys.
IDENTIFIERS = ('pmid', 'doi')
# Maps each (field, value) key to the list of articles filed under it; keys
# that occur on the same article are pointed at one shared list object.
articles = defaultdict(list)

# Number of rows that had none of the IDENTIFIERS fields filled in.
without_identifiers = 0

# NOTE(review): neither file is closed in this section; ``writer`` is
# presumably used further down the script -- confirm before adding a close.
reader = csv.reader(open('../parsed/articles.csv', 'r'))
writer = csv.writer(open('../parsed/clustered.csv', 'w'))

try:
    for i, article in enumerate(reader):
        article = Article(*article)
        identifiers = [(k, v) for k, v in article._asdict().items()
                       if k in IDENTIFIERS and v]
        if not identifiers:
            without_identifiers += 1
            continue
        articles[identifiers[0]].append(article)
        for identifier in identifiers[1:]:
            if articles[identifiers[0]] is not articles[identifier]:
                # In-place extend keeps the first key's list object, then the
                # other key is re-pointed at that same list.
                articles[identifiers[0]] += articles[identifier]
                articles[identifier] = articles[identifiers[0]]

        # Progress marker every 10000 rows.
        if i % 10000 == 0:
            print("%7d" % i)
except (KeyboardInterrupt, Exception) as error:
    # Still best-effort (whatever was clustered so far is kept), but the
    # reason for stopping is reported instead of being silently discarded.
    print("Clustering stopped early: %r" % (error,))