Example #1
def transform(r):
    # A truthy 'page' field marks the record as having a PDF.
    if r.get('page'):
        r['haspdf'] = 1
    else:
        r['haspdf'] = 0

    # Replace the citation list (which may be absent) with a plain count.
    try:
        r['citecnt'] = len(r['citedby'])
        del r['citedby']
    except (KeyError, TypeError):
        r['citecnt'] = 0
    if 'author' in r:
        r['author'] = [title_beautify(x) for x in r['author']]
    r['title'] = title_beautify(r['title'])
    return r
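A quick usage sketch, with a stand-in for title_beautify (the real normalizer lives elsewhere in the project) and a record shape inferred from the fields the function touches:

def title_beautify(s):   # stand-in for the project's real normalizer
    return s.title()

record = {
    'page': 12,                      # truthy, so haspdf becomes 1
    'citedby': [101, 102, 103],      # replaced by citecnt = 3
    'author': ['john doe'],
    'title': 'intriguing properties of neural networks',
}
print(transform(record))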
Example #2
def download():
    pid = long(request.values.get('pid'))
    agent = str(request.user_agent)
    db = get_mongo('paper')

    # Atomically bump the download counter and fetch only the fields we need.
    doc = db.find_and_modify(query={'_id': pid},
                             update={'$inc': {'download_cnt': 1}},
                             fields={'pdf': 1, 'title': 1}
                            )
    # Check for a missing document before touching its fields.
    if not doc:
        return make_response(''), 404
    title = title_beautify(doc['title'])
    data = doc['pdf']
    resp = make_response(data)
    resp.headers['Content-Type'] = 'application/pdf'

    # chrome doesn't work with comma in filename
    #if agent.find('Chrom') != -1:
        #title = title.replace(',', ' ')

    # TODO deal with unicode name!
    resp.headers['Content-Disposition'] = \
            'attachment; filename="{0}.pdf"'.format(title)
    return resp
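download() reads the request and builds responses the way a Flask view does; a minimal sketch of how it might be wired up, assuming a standard Flask app (the URL rule here is hypothetical):

from flask import Flask

app = Flask(__name__)
# Hypothetical route; the real URL rule is defined elsewhere in the project.
app.add_url_rule('/download', 'download', download, methods=['GET', 'POST'])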
Example #3
def __init__(self, query):
    query = title_beautify(query)
    self.query = query
    self.success = False
    self.title = query
    self.existing = None
    self.meta = {}
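Example #8 below constructs JobContext(query), so this snippet is very likely that class's constructor. A usage sketch:

ctx = JobContext('intriguing properties of neural networks')
# ctx.query and ctx.title both hold the beautified query;
# ctx.success, ctx.existing and ctx.meta start as False, None and {}.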
Example #4
def get_title(self):
    # self.title is a three-state cache: None means never fetched,
    # '' means a previous fetch failed, anything else is the result.
    if self.title is not None:
        if not self.title:
            return None
        else:
            return self.title
    try:
        self.title = title_beautify(self._do_get_title())
        return self.title
    except Exception:
        self.title = ""
        return None
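A failed fetch is remembered as the empty string, so it is never retried. A toy host class makes that concrete (everything below is illustrative):

class FakePaper(object):
    def __init__(self):
        self.title = None
    def _do_get_title(self):
        raise RuntimeError('simulated fetch failure')

FakePaper.get_title = get_title   # attach the method above to the toy class

p = FakePaper()
print(p.get_title())  # None: the fetch fails and '' is cached
print(p.get_title())  # None again, without a second fetch attempt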
Example #5
def call(query):
    res = func(query.lower())
    for k in res:
        k['title'] = title_beautify(k['title'])
    return res
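call closes over a func variable, so it almost certainly comes from a wrapper factory or decorator; a plausible reconstruction (the factory name beautify_results is hypothetical):

def beautify_results(func):
    # Hypothetical wrapper: lowercase the query before calling the backend,
    # then beautify each result title on the way out.
    def call(query):
        res = func(query.lower())
        for k in res:
            k['title'] = title_beautify(k['title'])
        return res
    return call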
Example #6
def similar_search(query):
    query = query.strip().lower()
    ret = []
    for cand in all_titles:
        dist = levenshtein(query, cand[0])
        if dist < 3:
            ret.append((cand, dist))
    if not ret:
        return None
    # Take the closest candidate (smallest edit distance), not the farthest.
    res = min(ret, key=operator.itemgetter(1))

    db = get_mongo('paper')
    res = db.find_one({'_id': res[0][1]}, SEARCH_RETURN_FIELDS)
    return res


def add_title_for_similar_search(cand):
    """ cand = (title, id) """
    all_titles.append((cand[0].strip().lower(), cand[1]))

def init_title_for_similar_search():
    if len(all_titles) > 0:
        return
    db = get_mongo('paper')
    itr = db.find({}, {'title': 1})
    for cand in itr:
        add_title_for_similar_search((cand['title'], cand['_id']))

init_title_for_similar_search()

if __name__ == '__main__':
    print(search_exact(title_beautify('Intriguing properties of neural networks')))
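The snippet assumes a levenshtein(a, b) edit-distance helper that is not shown (the real project may use a faster C implementation); a minimal pure-Python stand-in:

def levenshtein(a, b):
    # Classic dynamic-programming edit distance, computed one row at a time.
    prev = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                    # deletion
                           cur[j - 1] + 1,                 # insertion
                           prev[j - 1] + (ca != cb)))      # substitution
        prev = cur
    return prev[len(b)]

With the dist < 3 threshold above, only titles within two edits of the query count as similar.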
Example #7
def search(ctx):
    query = ctx.query.lower()

    ret = {}
    ret['ctx_update'] = {}
    srs = []

    r = requests.get(GOOGLE_SCHOLAR_URL.format(query))
    text = r.text.encode('utf-8')
    #with open('/tmp/b.html', 'r') as f:
        #text = f.read()

    def find_citecnt(dom):
        # The citation count is the number inside the first footer link
        # ('Cited by NNN') of a result's 'gs_fl' block.
        try:
            find = dom.findAll(attrs={'class': 'gs_ri'})[0]
            find = find.findAll(attrs={'class': 'gs_fl'})[0]
            find = find.findAll('a')[0].text
            cnt = re.search('[0-9]+', find).group()
            return int(cnt)
        except (IndexError, AttributeError, ValueError):
            return None

    soup = BeautifulSoup(text, BS_PARSER)
    results = soup.findAll(attrs={'class': 'gs_r'})
    title_updated = None
    for rst in results:
        try:
            h3 = rst.findAll('h3')[0]
            real_title = h3.get_text()
            real_title = filter_title_fileformat(real_title)
            tc = title_correct(query, real_title)
            if not tc[0]:
                continue
            if not title_updated and tc[1]:
                title_updated = ensure_unicode(title_beautify(real_title))
                while True:     # strip leading tags like '[citation][c]'
                    new_title = re.sub(r'^\[[^\]]*\]', '', title_updated).strip()
                    if new_title == title_updated:
                        break
                    title_updated = new_title
                log_info(u"Title updated: {0}".format(title_updated))
                ret['ctx_update']['title'] = title_updated

            cnt = find_citecnt(rst)
            if cnt is not None:
                ret['ctx_update']['citecnt'] = cnt

            try:
                url = str(h3.find('a').get('href'))
                srs.append(SearchResult(None, url))
            except Exception:
                pass

            findpdf = rst.findAll(attrs={'class': 'gs_ggs'})
            if findpdf:
                pdflink = findpdf[0].find('a').get('href')
                url = str(pdflink)
                srs.append(SearchResult('directpdf', url))
        except Exception as e:
            log_exc("Search Item parse error: {0}".format(str(e)))
    ret['results'] = srs
    return ret
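GOOGLE_SCHOLAR_URL and BS_PARSER are module constants the snippet assumes; plausible values, shown only for illustration (the exact query URL is a guess):

GOOGLE_SCHOLAR_URL = 'https://scholar.google.com/scholar?q={0}'
BS_PARSER = 'html.parser'   # or 'lxml', if installed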
Example #8
def handle_title_query(query):
    query = title_beautify(query)
    log_info("Get title query: {0}".format(query))

    # starts-with search in db
    res = search_startswith(query)  # and the idf is large
    if res:
        log_info("Found {0} results in db: {1}".format(
            len(res), str([x['_id'] for x in res])))
        return res
    # similar search
    res = similar_search(query)
    if res:
        log_info(u"Found similar results in db: {0}".format(res['_id']))
        return [res]

    # search on web
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # Search and get all the results item
    all_search_results = []
    for s in async_results:
        s = s.get(ukconfig.PYTHON_POOL_TIMEOUT)
        if s is None:
            continue
        srs = s['results']

        # try search database with updated title
        try:
            updated_title = s['ctx_update']['title']
        except KeyError:
            pass
        else:
            if updated_title != query:
                query = updated_title
                res = search_exact(query)
                if res:
                    log_info("Found {0} results in db: {1}".format(
                        len(res), str([x['_id'] for x in res])))
                    return res
        all_search_results.extend(srs)

        meta = s.get('ctx_update')
        if meta:
            log_info('Meta update from searcher: {0}'.format(str(meta.keys())))
            ctx.update_meta_dict(meta)
    pool.close()
    pool.terminate()

    # Analyse each result and try to parse info
    download_candidates = []
    parser_used = set()
    found = False
    for sr in all_search_results:
        for parser in parsers:
            if parser.can_handle(sr):
                download_candidates.append((parser, sr))
                if ctx.need_field(parser.support_meta_field):
                    # Already tried this fetcher
                    if not parser.repeatable and \
                            parser.name in parser_used:
                        continue
                    else:
                        parser_used.add(parser.name)

                    succ = parser.fetch_info(ctx, sr)
                    if not succ:
                        continue
                    found = True
                    if ctx.existing is not None:
                        log_info("Found {0} results in db".format(
                            len(ctx.existing)))
                        return [ctx.existing]

    # no metadata or downloadable source found
    if not found and len(download_candidates) == 0:
        return None
    # Save data, return data and start downloading
    try:
        pid = new_paper(ctx)
        ret = [{
            '_id': pid,
            'title': ctx.title,
            'view_cnt': 1,
            'download_cnt': 0
        }]
        ret[0].update(ctx.meta)

        progress_dict[pid] = 0.0
        if len(download_candidates) > 0:
            thread = Thread(target=start_download,
                            args=(download_candidates, ctx, pid))
            thread.start()
        return ret
    except Exception:
        log_exc("Failed to save to db")