Example #1
def main():
    global args
    args = get_args()
    query = args.title
    directory = args.directory


    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    download_candidates = []
    if re.match('^http[s]?://', query):
        # skip search
        ctx = JobContext("")
        sr = SearchResult(None, query)
        for parser in parsers:
            if parser.can_handle(sr):
                parser.fetch_info(ctx, sr)      # will update title
                download_candidates.append((parser, sr))
    else:
        #query = "Distinctive image features from scale-invariant keypoint"
        ctx = JobContext(query)

        search_args = zip(searchers, [ctx] * len(searchers))
        pool = Pool()
        as_results = [pool.apply_async(searcher_run, arg) for arg in search_args]
        #results = [searcher_run(*arg) for arg in search_args]  # for debug

        for s in as_results:
            s = s.get()
            if s is None:
                continue
            ctx.update_meta_dict(s['ctx_update'])
            print s['ctx_update']
            ctx.try_update_title_from_search_result(s)

            for sr in s['results']:
                for parser in parsers:
                    if parser.can_handle(sr):
                        parser.fetch_info(ctx, sr)      # will update title
                        download_candidates.append((parser, sr))
        pool.terminate()

    download_candidates = sorted(
        download_candidates,
        key=lambda x: x[0].priority,
        reverse=True)
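    # candidates are tried in priority order; the loop below stops at the first successful download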

    for (parser, sr) in download_candidates:
        data = parser.download(sr)
        if not data:
            continue
        data = pdf_compress(data)
        if ctx.title:
            ctx.title = finalize_filename(ctx.title)
        else:
            log_info("Failed to guess paper title!")
            ctx.title = "Unnamed Paper {}".format(md5(data))

        filename = os.path.join(directory, ctx.title + ".pdf")
        if os.path.exists(filename):
            log_err("File \"{}\" exists! overwrite? (y/n)".format(os.path.basename(filename)))
            resp = raw_input()
            if resp not in ['y', 'Y']:
                log_info("No file written. Exiting...")
                break
        with open(filename, 'wb') as f:
            f.write(data)
        if args.output:
            os.rename(filename, args.output)
        log_info("Successfully downloaded to {0}".format(filename))
        break
    else:
        log_err("Failed to download {0}".format(ctx.title))
    if ctx.meta.get('bibtex'):
        log_info("Bibtex:\n{}".format(ctx.meta['bibtex']))
    if ctx.meta.get('author'):
        log_info("Author: {0}".format(ctx.meta['author']))
    if ctx.meta.get('citecnt'):
        log_info("Cite count: {0}".format(ctx.meta['citecnt']))
Example #2
# Author: Yuxin Wu <*****@*****.**>

import ukconfig
ukconfig.USE_DB = False
ukconfig.USE_INDEXER = False

from fetcher import register_parser, SearchResult
from job import JobContext
from ukdbconn import new_paper

import sys

if __name__ == '__main__':
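    # one extra command-line argument re-enables the database for this test run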
    if len(sys.argv) == 2:
        ukconfig.USE_DB = True
    ctx = JobContext("Test Filename")

    parser = register_parser.parser_dict['arxiv.org']
    sr = SearchResult(None, "http://arxiv.org/abs/1312.6680")
    #sr = SearchResult(None, "  http://arxiv.org/abs/1404.3610")

    #parser = register_parser.parser_dict['dl.acm.org']
    #url = "http://dl.acm.org/citation.cfm?id=1859761"  # twitter
    #url = "http://dl.acm.org/citation.cfm?id=996342"    # SIFT # Large Number of cited
    #url = "http://dl.acm.org/citation.cfm?id=2366157"  # big
    #url = "http://dl.acm.org/citation.cfm?id=1656278"  # Weka
    #sr = SearchResult(None, url)

    #parser = register_parser.parser_dict['ieeexplore.ieee.org']
    #sr = SearchResult(None, "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=00726791")
    #sr = SearchResult(None, "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4244529")
Example #3
# Date: Tue Jan 20 14:22:43 2015 +0800
# Author: Yuxin Wu <*****@*****.**>

from multiprocessing import Pool
import sys

import searcher
from job import JobContext
from searcher import searcher_run

if __name__ == '__main__':
    query = sys.argv[1]
    searchers = searcher.register_searcher.get_searcher_list()
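    # skip the first registered searcher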
    searchers = searchers[1:]
    print [k.name for k in searchers]
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # Search and get all the results item
    for s in async_results:
        s = s.get()
        if s is None:
            continue
        srs = s['results']

        print srs

        meta = s.get('ctx_update')
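
The loop above retrieves each searcher's results and metadata but does not use them; a short hedged continuation, assuming update_meta_dict() and try_update_title_from_search_result() behave as in Example #1:

        if meta:
            # merge searcher-provided metadata into the job context
            ctx.update_meta_dict(meta)
            ctx.try_update_title_from_search_result(s)

    print ctx.title
    print ctx.meta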
Example #4
def handle_title_query(query):
    query = title_beautify(query)
    log_info("Get title query: {0}".format(query))

    # start searching
    res = search_startswith(query)  # and the idf is large
    if res:
        log_info("Found {0} results in db: {1}".format(
            len(res), str([x['_id'] for x in res])))
        return res
    # similar search
    res = similar_search(query)
    if res:
        log_info(u"Found similar results in db: {0}".format(res['_id']))
        return [res]

    # search on web
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # Search and get all the results item
    all_search_results = []
    for s in async_results:
        s = s.get(ukconfig.PYTHON_POOL_TIMEOUT)
        if s is None:
            continue
        srs = s['results']

        # try search database with updated title
        try:
            updated_title = s['ctx_update']['title']
        except KeyError:
            pass
        else:
            if updated_title != query:
                query = updated_title
                res = search_exact(query)
                if res:
                    log_info("Found {0} results in db: {1}".format(
                        len(res), str([x['_id'] for x in res])))
                    return res
        all_search_results.extend(srs)

        meta = s.get('ctx_update')
        if meta:
            log_info('Meta update from searcher: {0}'.format(str(meta.keys())))
            ctx.update_meta_dict(meta)
    pool.close()
    pool.terminate()

    # Analyse each result and try to parse info
    download_candidates = []
    parser_used = set()
    found = False
    for sr in all_search_results:
        for parser in parsers:
            if parser.can_handle(sr):
                download_candidates.append((parser, sr))
                if ctx.need_field(parser.support_meta_field):
                    # Already tried this fetcher
                    if not parser.repeatable and \
                            parser.name in parser_used:
                        continue
                    else:
                        parser_used.add(parser.name)

                    succ = parser.fetch_info(ctx, sr)
                    if not succ:
                        continue
                    found = True
                    if ctx.existing is not None:
                        log_info("Found {0} results in db".format(
                            len(ctx.existing)))
                        return [ctx.existing]

    # no metadata or downloadable source found
    if not found and len(download_candidates) == 0:
        return None
    # Save data, return data and start downloading
    try:
        pid = new_paper(ctx)
        ret = [{
            '_id': pid,
            'title': ctx.title,
            'view_cnt': 1,
            'download_cnt': 0
        }]
        ret[0].update(ctx.meta)

        progress_dict[pid] = 0.0
        if len(download_candidates) > 0:
            thread = Thread(target=start_download,
                            args=(download_candidates, ctx, pid))
            thread.start()
        return ret
    except Exception:
        log_exc("Failed to save to db")
Example #5
def main():
    global args
    args = get_args()
    query = args.title
    directory = args.directory

    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    download_candidates = []
    if re.match('^http[s]?://', query):
        # skip search
        ctx = JobContext("")
        sr = SearchResult(None, query)
        for parser in parsers:
            if parser.can_handle(sr):
                parser.fetch_info(ctx, sr)  # will update title
                download_candidates.append((parser, sr))
    else:
        #query = "Distinctive image features from scale-invariant keypoint"
        ctx = JobContext(query)

        search_args = zip(searchers, [ctx] * len(searchers))
        pool = Pool()
        as_results = [
            pool.apply_async(searcher_run, arg) for arg in search_args
        ]
        #results = [searcher_run(*arg) for arg in search_args]  # for debug

        for s in as_results:
            s = s.get()
            if s is None:
                continue
            ctx.update_meta_dict(s['ctx_update'])
            print s['ctx_update']
            ctx.try_update_title_from_search_result(s)

            for sr in s['results']:
                for parser in parsers:
                    if parser.can_handle(sr):
                        parser.fetch_info(ctx, sr)  # will update title
                        download_candidates.append((parser, sr))
        pool.terminate()

    download_candidates = sorted(download_candidates,
                                 key=lambda x: x[0].priority,
                                 reverse=True)

    for (parser, sr) in download_candidates:
        data = parser.download(sr)
        if not data:
            continue
        data = pdf_compress(data)
        if ctx.title:
            ctx.title = finalize_filename(ctx.title)
        else:
            log_info("Failed to guess paper title!")
            ctx.title = "Unnamed Paper {}".format(md5(data))

        filename = os.path.join(directory, ctx.title + ".pdf")
        if os.path.exists(filename):
            log_err("File \"{}\" exists! overwrite? (y/n)".format(
                os.path.basename(filename)))
            resp = raw_input()
            if resp not in ['y', 'Y']:
                log_info("No file written. Exiting...")
                break
        with open(filename, 'wb') as f:
            f.write(data)
        if args.output:
            os.rename(filename, args.output)
        log_info("Successfully downloaded to {0}".format(filename))
        break
    else:
        log_err("Failed to download {0}".format(ctx.title))
    if ctx.meta.get('bibtex'):
        log_info("Bibtex:\n{}".format(ctx.meta['bibtex']))
    if ctx.meta.get('author'):
        log_info("Author: {0}".format(ctx.meta['author']))
    if ctx.meta.get('citecnt'):
        log_info("Cite count: {0}".format(ctx.meta['citecnt']))
Example #6
def handle_title_query(query):
    query = title_beautify(query)
    log_info("Get title query: {0}".format(query))

    # start searching
    res = search_startswith(query)  # and the idf is large
    if res:
        log_info("Found {0} results in db: {1}".format(
            len(res), str([x['_id'] for x in res])))
        return res
    # similar search
    res = similar_search(query)
    if res:
        log_info(u"Found similar results in db: {0}".format(res['_id']))
        return [res]

    # search on web
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # Search and get all the results item
    all_search_results = []
    for s in async_results:
        s = s.get(ukconfig.PYTHON_POOL_TIMEOUT)
        if s is None:
            continue
        srs = s['results']

        # try search database with updated title
        try:
            updated_title = s['ctx_update']['title']
        except KeyError:
            pass
        else:
            if updated_title != query:
                query = updated_title
                res = search_exact(query)
                if res:
                    log_info("Found {0} results in db: {1}".format(
                        len(res), str([x['_id'] for x in res])))
                    return res
        all_search_results.extend(srs)

        meta = s.get('ctx_update')
        if meta:
            log_info('Meta update from searcher: {0}'.format(str(meta.keys())))
            ctx.update_meta_dict(meta)
    pool.close()
    pool.terminate()

    # Analyse each result and try to parse info
    download_candidates = []
    parser_used = set()
    found = False
    for sr in all_search_results:
        for parser in parsers:
            if parser.can_handle(sr):
                download_candidates.append((parser, sr))
                if ctx.need_field(parser.support_meta_field):
                    # Already tried this fetcher
                    if not parser.repeatable and \
                            parser.name in parser_used:
                        continue
                    else:
                        parser_used.add(parser.name)

                    succ = parser.fetch_info(ctx, sr)
                    if not succ:
                        continue
                    found = True
                    if ctx.existing is not None:
                        log_info("Found {0} results in db".format(len(ctx.existing)))
                        return [ctx.existing]

    # no metadata or downloadable source found
    if not found and len(download_candidates) == 0:
        return None
    # Save data, return data and start downloading
    try:
        pid = new_paper(ctx)
        ret = [{'_id': pid,
                'title': ctx.title,
                'view_cnt': 1,
                'download_cnt': 0
               }]
        ret[0].update(ctx.meta)

        progress_dict[pid] = 0.0
        if len(download_candidates) > 0:
            thread = Thread(target=start_download, args=(download_candidates,
                                                         ctx, pid))
            thread.start()
        return ret
    except Exception:
        log_exc("Failed to save to db")
Example #7
# Date: Tue Jan 20 14:22:43 2015 +0800
# Author: Yuxin Wu <*****@*****.**>

from multiprocessing import Pool
import sys

import searcher
from job import JobContext
from searcher import searcher_run

if __name__ == '__main__':
    query = sys.argv[1]
    searchers = searcher.register_searcher.get_searcher_list()
    searchers = searchers[1:]
    print [k.name for k in searchers]
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # Search and get all the results item
    for s in async_results:
        s = s.get()
        if s is None:
            continue
        srs = s['results']

        print srs

        meta = s.get('ctx_update')