Пример #1
0
def run(medline_path, clean, start, end, PROCESSES):
    con = 'postgresql://*****:*****@localhost/'+db

    if end != None:
        end = int(end)

    if clean:
        PubMedDB.create_tables(db)
    
    PubMedDB.init(db)

    paths = []
    for root, dirs, files in os.walk(medline_path):
        for filename in files:
            if os.path.splitext(filename)[-1] in [".xml", ".gz"]:
                paths.append(os.path.join(root,filename))

    paths.sort()
    

    pool = Pool(processes=PROCESSES)    # start with processors
    print "Initialized with ", PROCESSES, "processes"
    #result.get() needs global variable db now - that is why a line "db = options.database" is added in "__main__" - the variable db cannot be given to __start_parser in map_async()
    result = pool.map_async(_start_parser, paths[start:end])
    res = result.get()
    #without multiprocessing:
    #for path in paths:
    #    _start_parser(path)

    print "######################"
    print "###### Finished ######"
    print "######################"
Пример #2
0
def run(medline_path, clean, start, end, PROCESSES):
    con = 'postgresql://*****:*****@localhost/' + db

    if end != None:
        end = int(end)

    if clean:
        PubMedDB.create_tables(db)

    PubMedDB.init(db)

    paths = []
    for root, dirs, files in os.walk(medline_path):
        for filename in files:
            if os.path.splitext(filename)[-1] in [".xml", ".gz"]:
                paths.append(os.path.join(root, filename))

    paths.sort()

    pool = Pool(processes=PROCESSES)  # start with processors
    print "Initialized with ", PROCESSES, "processes"
    #result.get() needs global variable db now - that is why a line "db = options.database" is added in "__main__" - the variable db cannot be given to __start_parser in map_async()
    result = pool.map_async(_start_parser, paths[start:end])
    res = result.get()
    #without multiprocessing:
    #for path in paths:
    #    _start_parser(path)

    print "######################"
    print "###### Finished ######"
    print "######################"
Пример #3
0
    def run(self, medline_path, clean, start, end, PROCESSES):
        if end is not None:
            end = int(end)

        if clean:
            PubMedDB.create_tables(self.db_engine)

        paths = []
        for root, dirs, files in os.walk(medline_path):
            for filename in files:
                if os.path.splitext(filename)[-1] in [".xml", ".gz"]:
                    paths.append(os.path.join(root, filename))

        # Don't reload what we've already got
        with FilePreloadScreener(paths, self.db_engine) as screener:
            paths = screener.exclude_loaded_files(paths)

        paths.sort()

        print "Running for %d files" % (len(paths), )

        # result.get() needs global variable `db` now - that is why a line "db = options.database" is added in "__main__" -
        #  the variable db cannot be given to __start_parser in map_async()

        if PROCESSES > 1 and len(paths) > 1:

            from contextlib import closing

            with closing(Pool(processes=PROCESSES)) as pool:
                print "Running multi-process with %d processes" % (PROCESSES, )
                result = pool.map_async(_start_parser, paths[start:end])
                res = result.get()

        # without multiprocessing:
        else:
            print "Running single process"
            for path in paths:
                _start_parser(path)

        print "######################"
        print "###### Finished ######"
        print "######################"