Example #1
def filter_out_based_on_date_range(recids, fromdate="", untildate="", set_spec=None):
    """ Filter out recids based on date range."""
    if fromdate:
        fromdate = normalize_date(fromdate, "T00:00:00Z")
    else:
        fromdate = get_earliest_datestamp()
    fromdate = utc_to_localtime(fromdate)

    if untildate:
        untildate = normalize_date(untildate, "T23:59:59Z")
    else:
        untildate = get_latest_datestamp()
    untildate = utc_to_localtime(untildate)

    if set_spec is not None: ## either it has a value or it is empty, thus meaning all records
        last_updated = get_set_last_update(set_spec)
        if last_updated is not None:
            last_updated = utc_to_localtime(last_updated)
            if last_updated > fromdate:
                fromdate = utc_to_localtime(get_earliest_datestamp())

    recids = intbitset(recids) ## Let's clone :-)

    if fromdate and untildate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date BETWEEN %s AND %s", (fromdate, untildate)))
    elif fromdate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date >= %s", (fromdate, )))
    elif untildate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date <= %s", (untildate, )))

    if cfg.get('CFG_OAI_FILTER_RESTRICTED_RECORDS', True):
        recids = recids - get_all_restricted_recids()

    return recids
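
A self-contained sketch of the intbitset operations this snippet leans on (cloning the caller's set, intersecting with query results, subtracting restricted ids); the record ids below are made up:

from intbitset import intbitset

requested = intbitset([10, 20, 30, 40])
recids = intbitset(requested)      # clone, so the caller's set stays untouched
recids &= intbitset([20, 30, 99])  # keep only ids "modified" in the date range
recids = recids - intbitset([30])  # drop "restricted" records
assert recids == intbitset([20])
assert requested == intbitset([10, 20, 30, 40])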
Example #2
def burmeister(filename):
    with open(filename, "r") as f:
        # File format marker
        txt = f.readline().strip()
        if txt != "B":
            raise Exception("Bad file format")
        # The name of the context
        name = f.readline().strip()
        # Read the volume of the extent and the intent
        G = int(f.readline().strip())
        M = int(f.readline().strip())
        skip_empty(f)
        # Read the labels first the extent, last the intent
        extent = [f.readline().strip() for x in range(G)]
        intent = [f.readline().strip() for x in range(M)]
        skip_empty(f)
        # Load and process the object-attribute relationship,
        #   expect that it is stored extent-wise
        ctx0 = []
        for g in range(G):
            line = f.readline().rstrip()
            ctx0.append(
                bs.intbitset([m for m in range(M) if line[m] in "X"], M))

        # Transpose the context
        ctxt = [
            bs.intbitset([g for g in range(G) if m in ctx0[g]], G)
            for m in range(M)
        ]
        return Context(extent, intent, (ctx0, ctxt), name)
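
The cross-table-to-bitset step above can be exercised on its own; a minimal sketch with a hypothetical 2x2 context (the bs alias mirrors the module's import of intbitset):

import intbitset as bs

rows = ["X.", ".X"]  # hypothetical cross-table: 2 objects x 2 attributes
M = 2
ctx0 = [bs.intbitset([m for m in range(M) if row[m] == "X"], M) for row in rows]
# Transpose: for each attribute, collect the objects that have it
ctxt = [bs.intbitset([g for g in range(len(rows)) if m in ctx0[g]], len(rows))
        for m in range(M)]
assert list(ctx0[0]) == [0] and list(ctxt[1]) == [1]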
Example #3
def get_citedby_hitset(ahitset, record_limit=None):
    """
    Return a hitset of records that are cited by records in the given
    ahitset. Useful for search engine's citedby:author:ellis feature.

    The parameter 'record_limit' is the maximum number of records of 'ahitset'
    to consider. If it is None (the default value) all the records will be
    used.
    """
    out = intbitset()
    if ahitset:
        try:
            iter(ahitset)
        except OverflowError:
            # ignore attempt to iterate over infinite ahitset
            pass
        else:
            # We don't want to overwrite the input parameter
            if record_limit is not None:
                limited_ahitset = ahitset[:record_limit]
            else:
                limited_ahitset = ahitset

            in_sql = ','.join('%s' for dummy in limited_ahitset)
            rows = run_sql("""SELECT citee FROM rnkCITATIONDICT
                              WHERE citer IN (%s)""" % in_sql, limited_ahitset)
            out = intbitset(rows)
    return out
Example #4
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for additional
    records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    if params['field'] or params['collection'] or params['pattern']:

        if not params['collection']:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None,
                                       of='id',
                                       c=params['collection'],
                                       p=params['pattern'],
                                       f=params['field']))
    return res
Example #5
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for additional
    records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    if params['field'] or params['collection'] or params['pattern']:

        if not params['collection']:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None,
                                                   of='id',
                                                   c=params['collection'],
                                                   p=params['pattern'],
                                                   f=params['field']))
    return res
Example #6
    def related_records(recids, recids_processed):
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            def check_date(mod_date):
                return mod_date.strftime(
                    "%Y-%m-%d %H:%M:%S") < latest_bibrank_run

            rel_recids = intbitset([
                recid for recid, mod_date in run_sql(sql)
                if check_date(mod_date)
            ])
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids
Example #7
def get_normalized_ranking_scores(response, hitset_filter=None, recids=[]):
    """
    Returns the result having normalized ranking scores, interval [0, 100].
    hitset_filter - optional filter for the results
    recids - optional recids that shall remain in the result despite the filter
    """
    if not len(response.results):
        return ([], intbitset())

    # response.maxScore does not work if something was added to the response
    max_score = float(response.results[0]['score'])
    ranked_result = []
    matched_recs = intbitset()

    for hit in response.results:
        recid = int(hit['id'])

        if (not hitset_filter and hitset_filter != []
            ) or recid in hitset_filter or recid in recids:
            normalised_score = 0
            if max_score > 0:
                normalised_score = int(100.0 / max_score * float(hit['score']))
            ranked_result.append((recid, normalised_score))
            matched_recs.add(recid)

    ranked_result.reverse()

    return (ranked_result, matched_recs)
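
The normalization above maps the top hit to 100 and scales the rest linearly, truncating toward zero; a quick arithmetic check:

max_score = 8.0
for score in (8.0, 4.0, 1.0):
    print(int(100.0 / max_score * score))  # -> 100, 50, 12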
Example #8
def get_normalized_ranking_scores(response, hitset_filter=None, recids=[]):
    """
    Returns the result having normalized ranking scores, interval [0, 100].
    hitset_filter - optional filter for the results
    recids - optional recids that shall remain in the result despite the filter
    """
    if not len(response.results):
        return ([], intbitset())

    # response.maxScore does not work if something was added to the response
    max_score = float(response.results[0]['score'])
    ranked_result = []
    matched_recs = intbitset()

    for hit in response.results:
        recid = int(hit['id'])

        if (not hitset_filter and hitset_filter != []) or recid in hitset_filter or recid in recids:
            normalised_score = 0
            if max_score > 0:
                normalised_score = int(100.0 / max_score * float(hit['score']))
            ranked_result.append((recid, normalised_score))
            matched_recs.add(recid)

    ranked_result.reverse()

    return (ranked_result, matched_recs)
Example #9
def search_unit(query, f, m, wl=None):
    """Search for similar records."""
    from invenio.legacy.search_engine import record_exists
    from invenio.legacy.bibrank.record_sorter import METHODS
    from invenio.legacy.bibrank.word_searcher import find_similar

    results = intbitset([])

    if query:
        if isinstance(query, intbitset):
            ahitset = query
        else:
            recid = int(query)
            ahitset = [recid] if record_exists(recid) == 1 else []

        if len(ahitset):
            for recid in ahitset:
                results |= intbitset(
                    find_similar('jif',
                                 recid,
                                 intbitset([]),
                                 rank_limit_relevance=0,
                                 verbose=0,
                                 methods=METHODS)[0])

    return results
Example #10
def get_citedby_hitset(ahitset, record_limit=None):
    """
    Return a hitset of records that are cited by records in the given
    ahitset. Useful for search engine's citedby:author:ellis feature.

    The parameter 'record_limit' is the maximum number of records of 'ahitset'
    to consider. If it is None (the default value) all the records will be
    used.
    """
    out = intbitset()
    if ahitset:
        try:
            iter(ahitset)
        except OverflowError:
            # ignore attempt to iterate over infinite ahitset
            pass
        else:
            # We don't want to overwrite the input parameter
            if record_limit is not None:
                limited_ahitset = ahitset[:record_limit]
            else:
                limited_ahitset = ahitset

            in_sql = ','.join('%s' for dummy in limited_ahitset)
            rows = run_sql("""SELECT citee FROM rnkCITATIONDICT
                              WHERE citer IN (%s)""" % in_sql, limited_ahitset)
            out = intbitset(rows)
    return out
Example #11
    def test_compare_sets_tids_sets_match_with_more_than_min_and_low_len(self):
        thresholds = Thresholds(high_len=3, low_len=1, length=4, min_high=2, small=False, min_len=2)
        qlow, qhigh = intbitset(), intbitset([3, 4, 6])
        ilow, ihigh = intbitset([1]), intbitset([3, 4, 6])

        candidate = match_set.compare_sets(qhigh, qlow, ihigh, ilow, thresholds, match_set.tids_sets_intersector, match_set.tids_set_counter)
        assert candidate
Example #12
def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Returns either biliographic phrases or words indexes."""

    ## is p enclosed in quotes? (coming from exact search)
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]

    ## okay, "real browse" follows:
    ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test

    if not f and p.find(":") > 0:  # does 'p' contain ':'?
        f, p = p.split(":", 1)

    coll_hitset = intbitset()
    for coll_name in colls:
        coll_hitset |= get_collection_reclist(coll_name)

    index_id = get_index_id_from_field(f)
    if index_id != 0:
        browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection(
            p, index_id, rg / 2, rg / 2, coll_hitset)
    else:
        browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg + 1) / 2 + 1,
                                                      (rg - 1) / 2 + 1)
        while not browsed_phrases:
            # try again and again with shorter and shorter pattern:
            try:
                p = p[:-1]
                browsed_phrases = get_nearest_terms_in_bibxxx(
                    p, f, (rg + 1) / 2 + 1, (rg - 1) / 2 + 1)
            except:
                register_exception(req=req, alert_admin=True)
                # probably there are no hits at all:
                return []

        ## try to check hits in these particular collection selection:
        browsed_phrases_in_colls = []
        if 0:
            for phrase in browsed_phrases:
                phrase_hitset = intbitset()
                phrase_hitsets = search_pattern("", phrase, f, 'e')
                for coll in colls:
                    phrase_hitset.union_update(phrase_hitsets[coll])
                if len(phrase_hitset) > 0:
                    # okay, this phrase has some hits in colls, so add it:
                    browsed_phrases_in_colls.append(
                        [phrase, len(phrase_hitset)])

        ## were there hits in collections?
        if browsed_phrases_in_colls == []:
            if browsed_phrases != []:
                #write_warning(req, """<p>No match close to <em>%s</em> found in given collections.
                #Please try different term.<p>Displaying matches in any collection...""" % p_orig)
                ## try to get nbhits for these phrases in any collection:
                for phrase in browsed_phrases:
                    nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset)
                    if nbhits > 0:
                        browsed_phrases_in_colls.append([phrase, nbhits])

    return browsed_phrases_in_colls
Example #13
def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    recids = intbitset()

    for set_def in get_set_definitions(set_spec):
        new_recids = perform_request_search(c=[coll.strip() \
                                               for coll in set_def['c'].split(',')],
                                            p1=set_def['p1'],
                                            f1=set_def['f1'],
                                            m1=set_def['m1'],
                                            op1=set_def['op1'],
                                            p2=set_def['p2'],
                                            f2=set_def['f2'],
                                            m2=set_def['m2'],
                                            op2=set_def['op2'],
                                            p3=set_def['p3'],
                                            f3=set_def['f3'],
                                            m3=set_def['m3'],
                                            ap=0)

        recids |= intbitset(new_recids)

    return recids
Example #14
def burmeister(filename):
    with open(filename, "r") as f:
        # File format marker
        txt = f.readline().strip()
        if txt != "B":
            raise Exception("Bad file format")
        # The name of the context
        name = f.readline().strip()
        # Read the volume of the extent and the intent
        G = int(f.readline().strip())
        M = int(f.readline().strip())
        skip_empty(f)
        # Read the labels first the extent, last the intent
        extent = [f.readline().strip() for x in range(G)]
        intent = [f.readline().strip() for x in range(M)]
        skip_empty(f)
        # Load and process the object-attribute relationship,
        #   expect that it is stored extent-wise
        ctx0 = []
        for g in range(G):
            line = f.readline().rstrip()
            ctx0.append(bs.intbitset(
                [m for m in range(M) if line[m] in "X"], M))
        # Transpose the context
        ctxt = [bs.intbitset([g for g in range(G)
                              if m in ctx0[g]], G) for m in range(M)]
        return Context(extent, intent, (ctx0, ctxt), name)
Example #15
    def test_get_ranked_larger_hitset(self):
        """solrutils - ranking larger hitset"""
        hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
        self.assertEqual(tuple(), self._get_ranked_result_sequence(query='Willnotfind', hitset=hitset))

        hitset = intbitset.intbitset([47, 56, 55, 56, 58, 68, 85, 89])
        self.assertEqual((55, 56), self._get_ranked_result_sequence(query='"higgs boson"', hitset=hitset))
Example #16
def filter_out_based_on_date_range(recids, fromdate="", untildate="", set_spec=None):
    """ Filter out recids based on date range."""
    if fromdate:
        fromdate = normalize_date(fromdate, "T00:00:00Z")
    else:
        fromdate = get_earliest_datestamp()
    fromdate = utc_to_localtime(fromdate)

    if untildate:
        untildate = normalize_date(untildate, "T23:59:59Z")
    else:
        untildate = get_latest_datestamp()
    untildate = utc_to_localtime(untildate)

    if set_spec is not None: ## either it has a value or it is empty, thus meaning all records
        last_updated = get_set_last_update(set_spec)
        if last_updated is not None:
            last_updated = utc_to_localtime(last_updated)
            if last_updated > fromdate:
                fromdate = utc_to_localtime(get_earliest_datestamp())

    recids = intbitset(recids) ## Let's clone :-)

    if fromdate and untildate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date BETWEEN %s AND %s", (fromdate, untildate)))
    elif fromdate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date >= %s", (fromdate, )))
    elif untildate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date <= %s", (untildate, )))

    if cfg.get('CFG_OAI_FILTER_RESTRICTED_RECORDS', True):
        recids = recids - get_all_restricted_recids()

    return recids
Example #17
def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    recids = intbitset()

    for set_def in get_set_definitions(set_spec):
        new_recids = perform_request_search(c=[coll.strip() \
                                               for coll in set_def['c'].split(',')],
                                            p1=set_def['p1'],
                                            f1=set_def['f1'],
                                            m1=set_def['m1'],
                                            op1=set_def['op1'],
                                            p2=set_def['p2'],
                                            f2=set_def['f2'],
                                            m2=set_def['m2'],
                                            op2=set_def['op2'],
                                            p3=set_def['p3'],
                                            f3=set_def['f3'],
                                            m3=set_def['m3'],
                                            ap=0)

        recids |= intbitset(new_recids)

    return recids
Example #18
def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    recids = intbitset()

    for set_def in get_set_definitions(set_spec):
        new_recids = perform_request_search(
            c=[coll.strip() for coll in set_def["c"].split(",")],
            p1=set_def["p1"],
            f1=set_def["f1"],
            m1=set_def["m1"],
            op1=set_def["op1"],
            p2=set_def["p2"],
            f2=set_def["f2"],
            m2=set_def["m2"],
            op2=set_def["op2"],
            p3=set_def["p3"],
            f3=set_def["f3"],
            m3=set_def["m3"],
            ap=0,
        )

        recids |= intbitset(new_recids)

    return recids
Example #19
def get_data_for_definition_marc(tags, recids):
    '''Having a list of tags and a list of recids, it returns a dictionary
    with the values corresponding to the tags'''
    #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x]
    #user: 140s, sys: 21s, total: 160s - cdsdev
    if isinstance(recids, (int, long)):
        recids = intbitset([recids, ])
    # for each recid we need only one value (the one we sort on),
    # so we can stop looking for a value as soon as we find one
    tag_index = 0
    field_data_dict = {}
    while len(recids) > 0 and tag_index < len(tags):
        write_message('%s records queried for values for tags %s.' \
                      %(len(recids), tags), verbose=5)
        res = _get_values_from_marc_tag(tags[tag_index], recids)
        res_dict = dict(res)
        # field_data_dict.update(res_dict)
        # We cannot use update() here, because res_dict might contain recids
        # that are already in field_data_dict, and we should not overwrite their values.
        field_data_dict = dict(res_dict, **field_data_dict)
        # There might be keys that we do not want (e.g. when using 'between'),
        # so we should remove them.
        res_dict_keys = intbitset(res_dict.keys())
        recids_not_needed = res_dict_keys.difference(recids)
        for recid in recids_not_needed:
            del field_data_dict[recid]
        # Update recids to contain only the recids that do not have values yet.
        recids.difference_update(res_dict_keys)
        tag_index += 1
    return field_data_dict
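
The dict(res_dict, **field_data_dict) idiom above merges the two dictionaries while letting the values already in field_data_dict win for duplicate keys (keyword arguments override the positional mapping). A minimal illustration with string keys; note that using integer recid keys, as the snippet does, relies on a CPython 2 quirk, since Python 3 rejects non-string keyword names:

existing = {"a": "kept"}
incoming = {"a": "new", "b": "added"}
merged = dict(incoming, **existing)  # kwargs override the positional mapping
assert merged == {"a": "kept", "b": "added"}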
Example #20
    def test_calc_mean_onbit_density(self):
        bitsets = {"a": intbitset([1, 2, 3]), "b": intbitset([1, 2, 4, 5, 8]), "c": intbitset([1, 2, 4, 8])}

        result = modifiedtanimoto.calc_mean_onbit_density(bitsets.values(), self.number_of_bits)

        expected = 0.04
        eq_(result, expected)
Example #21
def remove_member_from_node(G, node, member):

    # add in replacement edges if required
    mem_edges = list(
        set([e[1] for e in G.edges(node) if member in G.edges[e]['members']]))
    if len(mem_edges) > 1:
        for n1, n2 in itertools.combinations(mem_edges, 2):
            if G.has_edge(n1, n2):
                G[n1][n2]['members'] |= intbitset([member])
                G[n1][n2]['size'] = len(G[n1][n2]['members'])
            else:
                G.add_edge(n1, n2, size=1, members=intbitset([member]))

    # remove member from node
    G.nodes[node]['members'].discard(member)
    G.nodes[node]['seqIDs'] = set([
        sid for sid in G.nodes[node]['seqIDs']
        if sid.split("_")[0] != str(member)
    ])
    G.nodes[node]['size'] -= 1

    # remove member from edges of node
    edges_to_remove = []
    for e in G.edges(node):
        if member in G.edges[e]['members']:
            if len(G.edges[e]['members']) == 1:
                edges_to_remove.append(e)
            else:
                G.edges[e]['members'].discard(member)
                G.edges[e]['size'] = len(G.edges[e]['members'])
    for e in edges_to_remove:
        G.remove_edge(*e)

    return G
Example #22
def index_token_sets(token_ids, len_junk, len_good):
    """
    Return a 4-tuple of low & high tids sets, low & high tids multisets given a
    token_ids sequence.
    """
    # For multisets, we use a defaultdict, rather than a Counter. This is mildly
    # faster than a Counter for sparse sets.

    # this variant uses intbitset to evaluate its performance relative to bitarray

    low_tids_set = intbitset(len_junk)
    low_tids_set_add = low_tids_set.add
    high_tids_set = intbitset(len_good)
    high_tids_set_add = high_tids_set.add
    low_tids_mset = defaultdict(int)
    high_tids_mset = defaultdict(int)
    for tid in token_ids:
        # this skips unknown token ids that are -1 as well as possible None
        if tid < 0:
            continue
        if tid < len_junk:
            low_tids_mset[tid] += 1
            low_tids_set_add(tid)
        else:
            high_tids_mset[tid] += 1
            high_tids_set_add(tid)

    # sparsify for speed
    sparsify(low_tids_mset)
    sparsify(high_tids_mset)
    return low_tids_set, high_tids_set, low_tids_mset, high_tids_mset
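
A standalone sketch of the low/high partition performed above, with made-up token ids and boundary: ids below len_junk land in the low set, the rest in the high set, and negative ids (unknown tokens) are skipped:

from intbitset import intbitset

token_ids = [5, 12, -1, 3, 12, 20]
len_junk = 10  # hypothetical boundary: ids below 10 are "junk" tokens
low = intbitset([t for t in token_ids if 0 <= t < len_junk])
high = intbitset([t for t in token_ids if t >= len_junk])
assert low == intbitset([3, 5])
assert high == intbitset([12, 20])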
Example #23
def index_token_sets(token_ids, len_junk, len_good):
    """
    Return a 4-tuple of low & high tids sets, low & high tids multisets given a
    token_ids sequence.
    """
    # For multisets, we use a defaultdict, rather than a Counter. This is mildly
    # faster than a Counter for sparse sets.

    # this variant uses intbitset to evaluate its performance relative to bitarray

    low_tids_set = intbitset(len_junk)
    low_tids_set_add = low_tids_set.add
    high_tids_set = intbitset(len_good)
    high_tids_set_add = high_tids_set.add
    low_tids_mset = defaultdict(int)
    high_tids_mset = defaultdict(int)
    for tid in token_ids:
        # this skips unknown token ids that are -1 as well as possible None
        if tid < 0:
            continue
        if tid < len_junk:
            low_tids_mset[tid] += 1
            low_tids_set_add(tid)
        else:
            high_tids_mset[tid] += 1
            high_tids_set_add(tid)

    # sparsify for speed
    sparsify(low_tids_mset)
    sparsify(high_tids_mset)
    return low_tids_set, high_tids_set, low_tids_mset, high_tids_mset
Example #24
def get_records_with_num_cites(numstr, allrecs=intbitset([])):
    """Return an intbitset of record IDs that are cited X times,
       X defined in numstr.
       Warning: numstr is string and may not be numeric! It can
       be 10,0->100 etc
    """
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    cache_cited_by_dictionary_keys = get_citation_dict("citationdict_keys")
    cache_cited_by_dictionary_keys_intbitset = get_citation_dict("citationdict_keys_intbitset")
    matches = intbitset([])
    # once again, check that the parameter is a string
    if not isinstance(numstr, str):
        return intbitset([])
    numstr = numstr.replace(" ", "")
    numstr = numstr.replace('"', "")

    num = 0
    # first, check if numstr is just a number
    singlenum = re.findall(r"(^\d+$)", numstr)
    if singlenum:
        num = int(singlenum[0])
        if num == 0:
            # we return recids that are not in keys
            return allrecs - cache_cited_by_dictionary_keys_intbitset
        for k in cache_cited_by_dictionary_keys:
            li = cache_cited_by_dictionary[k]
            if len(li) == num:
                matches.add(k)
        return matches

    # try to get 1->10 or such
    firstsec = re.findall(r"(\d+)->(\d+)", numstr)
    if firstsec:
        first = 0
        sec = -1
        try:
            first = int(firstsec[0][0])
            sec = int(firstsec[0][1])
        except:
            return intbitset([])
        if (first == 0):
            # start with those that have no cites
            matches = allrecs - cache_cited_by_dictionary_keys_intbitset
        if (first <= sec):
            for k in cache_cited_by_dictionary_keys:
                li = cache_cited_by_dictionary[k]
                if len(li) >= first:
                    if len(li) <= sec:
                        matches.add(k)
            return matches

    firstsec = re.findall(r"(\d+)\+", numstr)
    if firstsec:
        first = firstsec[0]
        for k in cache_cited_by_dictionary_keys:
            li = cache_cited_by_dictionary[k]
            if len(li) > int(first):
                matches.add(k)
    return matches
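
The three numstr formats the docstring mentions are matched by the regexes above; a quick standalone illustration:

import re

for numstr in ('"27"', "0->100", "10+"):
    numstr = numstr.replace(" ", "").replace('"', "")
    print(numstr,
          re.findall(r"(^\d+$)", numstr),       # exact count, e.g. "27"
          re.findall(r"(\d+)->(\d+)", numstr),  # range, e.g. "0->100"
          re.findall(r"(\d+)\+", numstr))       # open-ended, e.g. "10+"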
Example #25
    def tmpl_papers_box(self, req, pubs, bibauthorid_data, num_downloads, ln, add_box=True, loading=False):
        _ = gettext_set_language(ln)
        if not loading and pubs:
            ib_pubs = intbitset(pubs)
            if bibauthorid_data["cid"]:
                baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["cid"])
            elif bibauthorid_data["pid"] > -1:
                baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["pid"])
            baid_query = baid_query + " "

            rec_query = baid_query
            searchstr = create_html_link(websearch_templates.build_search_url(p=rec_query),
                                         {}, "<strong>" + "All papers (" + str(len(pubs)) + ")" + "</strong>",)

            line2 = searchstr

            if CFG_BIBRANK_SHOW_DOWNLOAD_STATS and num_downloads:
                line2 += " (" + _("downloaded") + " "
                line2 += str(num_downloads) + " " + _("times") + ")"

            if CFG_INSPIRE_SITE:
                CFG_COLLS = ['Book',
                             'ConferencePaper',
                             'Introductory',
                             'Lectures',
                             'Preprint',
                             'Published',
                             'Review',
                             'Thesis']
            else:
                CFG_COLLS = ['Article',
                             'Book',
                             'Preprint', ]
            collsd = {}
            for coll in CFG_COLLS:
                coll_papers = list(ib_pubs & intbitset(perform_request_search(rg=0, f="collection", p=coll)))
                if coll_papers:
                    collsd[coll] = coll_papers
            colls = collsd.keys()
            colls.sort(lambda x, y: cmp(len(collsd[y]), len(collsd[x]))) # sort by number of papers
            for coll in colls:
                rec_query = baid_query + 'collection:' + wrap_author_name_in_quotes_if_needed(coll)
                line2 += "<br />" + create_html_link(websearch_templates.build_search_url(p=rec_query),
                                                                           {}, coll + " (" + str(len(collsd[coll])) + ")",)

        elif not pubs and not loading:
            line2 = _("No Papers")

        elif loading:
            line2 = self.loading_html()

        else:
            line2 = 'This is a bug and should be corrected'

        if not add_box:
            return line2
        line1 = "<strong>" + _("Papers") + "</strong>"
        papers_box = self.tmpl_print_searchresultbox("papers", line1, line2)
        return papers_box
Example #26
def get_records_that_can_be_displayed(permitted_restricted_collections,
                                      hitset_in_any_collection,
                                      current_coll=None,
                                      colls=None):
    """Return records that can be displayed."""
    current_coll = current_coll or cfg['CFG_SITE_NAME']
    records_that_can_be_displayed = intbitset()

    if colls is None:
        colls = [current_coll]

    policy = cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'].strip().upper()

    # real & virtual
    current_coll_children = get_collection_allchildren(current_coll)

    # Add all restricted collections that the user has access to and that are
    # under the current collection. Do not use a set here, in order to
    # maintain a specific order: children of 'cc' (real, virtual, restricted)
    # first, then the rest of 'c' that are not cc's children.
    colls_to_be_displayed = set([
        coll for coll in current_coll_children
        if coll in colls or coll in permitted_restricted_collections
    ])
    colls_to_be_displayed |= set(
        [coll for coll in colls if coll not in colls_to_be_displayed])

    # Get all records in applicable collections
    records_that_can_be_displayed = intbitset()
    for coll in colls_to_be_displayed:
        records_that_can_be_displayed |= get_collection_reclist(coll)

    if policy == 'ANY':
        # User needs to have access to at least one collection that restricts
        # the records. We need this to be able to remove records that are both
        # in a public and restricted collection.
        permitted_recids = intbitset()
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection in permitted_restricted_collections:
                permitted_recids |= get_collection_reclist(collection)
            else:
                notpermitted_recids |= get_collection_reclist(collection)
        notpermitted_recids -= permitted_recids
    else:
        # User needs to have access to all collections that restrict the records.
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection not in permitted_restricted_collections:
                notpermitted_recids |= get_collection_reclist(collection)

    # Remove records that cannot be seen by the user
    records_that_can_be_displayed -= notpermitted_recids

    # Intersect only if there are some matched records
    if not hitset_in_any_collection.is_infinite():
        records_that_can_be_displayed &= hitset_in_any_collection

    return records_that_can_be_displayed
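
A compact sketch of the 'ANY' policy arithmetic above, with hypothetical collection reclists: a record blocked by one restricted collection stays visible if the user is permitted on another restricted collection that also contains it:

from intbitset import intbitset

permitted_recids = intbitset([1, 2, 3])     # user may see this collection
notpermitted_recids = intbitset([2, 3, 4])  # user may not see this one
notpermitted_recids -= permitted_recids     # 'ANY': one permission suffices
assert notpermitted_recids == intbitset([4])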
Example #27
def load_graphs(graph_files, n_cpu=1):
    for graph_file in graph_files:
        if not os.path.isfile(graph_file):
            print("Missing:", graph_file)
            raise RuntimeError("Missing graph file!")

    graphs = [nx.read_gml(graph_file) for graph_file in tqdm(graph_files)]
    isolate_names = list(
        itertools.chain.from_iterable(
            [G.graph['isolateNames'] for G in graphs]))

    member_count = 0
    node_count = 0
    id_mapping = []
    for i, G in enumerate(graphs):
        id_mapping.append({})
        # relabel nodes to be consecutive integers from 1
        mapping = {}
        for n in G.nodes():
            mapping[n] = node_count
            node_count += 1
        G = nx.relabel_nodes(G, mapping, copy=True)

        # set up edge members and remove conflicts.
        for e in G.edges():
            G[e[0]][e[1]]['members'] = intbitset([
                m + member_count for m in conv_list(G[e[0]][e[1]]['members'])
            ])

        # set up node parameters and remove conflicts.
        max_mem = -1
        for n in G.nodes():
            ncentroids = []
            for sid in G.nodes[n]['centroid'].split(";"):
                nid = update_sid(sid, member_count)
                id_mapping[i][sid] = nid
                if "refound" not in nid:
                    ncentroids.append(nid)
            G.nodes[n]['centroid'] = ncentroids
            new_ids = set()
            for sid in conv_list(G.nodes[n]['seqIDs']):
                nid = update_sid(sid, member_count)
                id_mapping[i][sid] = nid
                new_ids.add(nid)
            G.nodes[n]['seqIDs'] = new_ids
            G.nodes[n]['protein'] = del_dups(G.nodes[n]['protein'].replace(
                '*', 'J').split(";"))
            G.nodes[n]['dna'] = del_dups(G.nodes[n]['dna'].split(";"))
            G.nodes[n]['lengths'] = conv_list(G.nodes[n]['lengths'])
            G.nodes[n]['longCentroidID'][1] = update_sid(
                G.nodes[n]['longCentroidID'][1], member_count)
            G.nodes[n]['members'] = intbitset(
                [m + member_count for m in conv_list(G.nodes[n]['members'])])
            max_mem = max(max_mem, max(G.nodes[n]['members']))

        member_count = max_mem + 1
        graphs[i] = G

    return graphs, isolate_names, id_mapping
Example #28
    def test_similarity(self):
        bitset1 = intbitset([1, 2, 3])
        bitset2 = intbitset([1, 2, 4, 8])

        result = modifiedtanimoto.similarity(bitset1, bitset2, self.number_of_bits, self.corr_st, self.corr_sto)

        expected = 0.5779523809525572
        assert_almost_equal(result, expected)
Example #29
    def test_get_ranked_smaller_hitset(self):
        """solrutils - ranking smaller hitset"""
        hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
        self.assertEqual((47, 56, 58, 68, 89, 85), self._get_ranked_result_sequence(query='higgs', hitset=hitset))

        hitset = intbitset.intbitset([45, 50, 61, 74, 94])
        self.assertEqual((50, 61, 74, 45, 94), self._get_ranked_result_sequence(query='of', hitset=hitset))
        self.assertEqual((74, 45, 94), self._get_ranked_result_sequence(query='of', hitset=hitset, rows=3))
Example #30
def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Returns either biliographic phrases or words indexes."""

    ## is p enclosed in quotes? (coming from exact search)
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]

    ## okay, "real browse" follows:
    ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test

    if not f and p.find(":") > 0:  # does 'p' contain ':'?
        f, p = p.split(":", 1)

    coll_hitset = intbitset()
    for coll_name in colls:
        coll_hitset |= get_collection_reclist(coll_name)

    index_id = get_index_id_from_field(f)
    if index_id != 0:
        browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection(
            p, index_id, rg / 2, rg / 2, coll_hitset
        )
    else:
        browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg + 1) / 2 + 1, (rg - 1) / 2 + 1)
        while not browsed_phrases:
            # try again and again with shorter and shorter pattern:
            try:
                p = p[:-1]
                browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg + 1) / 2 + 1, (rg - 1) / 2 + 1)
            except:
                register_exception(req=req, alert_admin=True)
                # probably there are no hits at all:
                return []

        ## try to check hits in these particular collection selection:
        browsed_phrases_in_colls = []
        if 0:
            for phrase in browsed_phrases:
                phrase_hitset = intbitset()
                phrase_hitsets = search_pattern("", phrase, f, "e")
                for coll in colls:
                    phrase_hitset.union_update(phrase_hitsets[coll])
                if len(phrase_hitset) > 0:
                    # okay, this phrase has some hits in colls, so add it:
                    browsed_phrases_in_colls.append([phrase, len(phrase_hitset)])

        ## were there hits in collections?
        if browsed_phrases_in_colls == []:
            if browsed_phrases != []:
                # write_warning(req, """<p>No match close to <em>%s</em> found in given collections.
                # Please try different term.<p>Displaying matches in any collection...""" % p_orig)
                ## try to get nbhits for these phrases in any collection:
                for phrase in browsed_phrases:
                    nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset)
                    if nbhits > 0:
                        browsed_phrases_in_colls.append([phrase, nbhits])

    return browsed_phrases_in_colls
Example #31
def get_records_that_can_be_displayed(permitted_restricted_collections,
                                      hitset_in_any_collection,
                                      current_coll=None, colls=None):
    """Return records that can be displayed."""
    current_coll = current_coll or cfg['CFG_SITE_NAME']
    records_that_can_be_displayed = intbitset()

    if colls is None:
        colls = [current_coll]

    policy = cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'].strip().upper()

    # real & virtual
    current_coll_children = get_collection_allchildren(current_coll)

    # Add all restricted collections that the user has access to and that are
    # under the current collection. Do not use a set here, in order to
    # maintain a specific order: children of 'cc' (real, virtual, restricted)
    # first, then the rest of 'c' that are not cc's children.
    colls_to_be_displayed = set([
        coll for coll in current_coll_children
        if coll in colls or coll in permitted_restricted_collections
    ])
    colls_to_be_displayed |= set([coll for coll in colls
                                  if coll not in colls_to_be_displayed])

    # Get all records in applicable collections
    records_that_can_be_displayed = intbitset()
    for coll in colls_to_be_displayed:
        records_that_can_be_displayed |= get_collection_reclist(coll)

    if policy == 'ANY':
        # User needs to have access to at least one collection that restricts
        # the records. We need this to be able to remove records that are both
        # in a public and restricted collection.
        permitted_recids = intbitset()
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection in permitted_restricted_collections:
                permitted_recids |= get_collection_reclist(collection)
            else:
                notpermitted_recids |= get_collection_reclist(collection)
        notpermitted_recids -= permitted_recids
    else:
        # User needs to have access to all collections that restrict the records.
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection not in permitted_restricted_collections:
                notpermitted_recids |= get_collection_reclist(collection)

    # Remove records that cannot be seen by the user
    records_that_can_be_displayed -= notpermitted_recids

    # Intersect only if there are some matched records
    if not hitset_in_any_collection.is_infinite():
        records_that_can_be_displayed &= hitset_in_any_collection

    return records_that_can_be_displayed
Example #32
def get_nbhits_in_idxphrases(word, f):
    """Return number of hits for 'word' inside phrase index for field 'f'."""
    model = IdxINDEX.idxPHRASEF(f or "anyfield")
    if model is None:
        return 0
    hitlist = intbitset()
    for item in model.query.filter_by(term=word).values('hitlist'):
        hitlist |= intbitset(item[0])
    return len(hitlist)
Example #33
 def get_records_for_user(qid, uid):
     key = get_search_results_cache_key_from_qid(qid)
     data = search_results_cache.get(key)
     if data is None:
         return intbitset([])
     cc = search_results_cache.get(key + '::cc')
     return get_records_that_can_be_displayed(
         current_user.get('precached_permitted_restricted_collections', []),
         intbitset().fastload(data), cc)
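
intbitset().fastload(data) above deserializes the compressed blob stored in the search results cache and returns the populated set; a round-trip sketch:

from intbitset import intbitset

hits = intbitset([1, 5, 9])
blob = hits.fastdump()                 # compressed binary representation
restored = intbitset().fastload(blob)  # fastload returns the populated set
assert restored == hits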
Example #34
 def __init__(self, extent, intent, ctx, name):
     super(Context, self).__init__()
     self.intent = intent
     self.extent = extent
     self.name = name
     ## Prepare the context
     self.__ctx = ctx
     self.__extent = bs.intbitset(range(len(self.extent)))
     self.__intent = bs.intbitset(range(len(self.intent)))
Example #35
 def get_records_for_user(qid, uid):
     key = get_search_results_cache_key_from_qid(qid)
     data = search_results_cache.get(key)
     if data is None:
         return intbitset([])
     cc = search_results_cache.get(key + '::cc')
     return get_records_that_can_be_displayed(
         current_user.get('precached_permitted_restricted_collections', []),
         intbitset().fastload(data), cc)
Example #36
	def __init__(self, extent, intent, ctx, name):
		super(Context, self).__init__()
		self.intent = intent
		self.extent = extent
		self.name = name
		## Prepare the context
		self.__ctx = ctx
		self.__extent = bs.intbitset(range(len(self.extent)))
		self.__intent = bs.intbitset(range(len(self.intent)))
Example #37
def get_nbhits_in_idxphrases(word, f):
    """Return number of hits for 'word' inside phrase index for field 'f'."""
    model = IdxINDEX.idxPHRASEF(f or "anyfield")
    if model is None:
        return 0
    hitlist = intbitset()
    for item in model.query.filter_by(term=word).values('hitlist'):
        hitlist |= intbitset(item[0])
    return len(hitlist)
Example #38
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []

    write_message("Reading knowledgebase file: %s" % \
                   config.get(config.get("rank_method", "function"), "kb_src"))

    kb_src = config.get(config.get("rank_method", "function"), "kb_src").strip()
    # Find path from configuration registry by knowledge base name.
    kb_src_clean = configuration.get(kb_src)

    with open(kb_src_clean, 'r') as kb_file:
        data = kb_file.readlines()

    for line in data:
        if not line.startswith("#"):
            parts = line.strip().split("---")
            kb_data[parts[0].strip()] = parts[1]
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(", ")
    if tags == ['']:
        tags = ""

    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide))
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset()
            newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))]
            valid.intersection_update(newset)
        if tags:
            recs = filter(lambda x: x[0] in valid, recs)
        records = records + list(recs)
        write_message("Number of records found with the necessary tags: %s" % len(records))

    records = filter(lambda x: x[0] in options["validset"], records)
    rnkset = {}
    for key, value in records:
        if value in kb_data:
            if key not in rnkset:
                rnkset[key] = float(kb_data[value])
            else:
                if rnkset[key] in kb_data and float(kb_data[value]) > float((rnkset[key])[1]):
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0

    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset
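
The valid set above starts out as an infinite intbitset (trailing_bits=1 marks every non-negative id as present) and is then narrowed by intersection; a standalone illustration of the trick:

from intbitset import intbitset

valid = intbitset(trailing_bits=1)  # "universe" set containing every id
valid.discard(0)
assert valid.is_infinite() and 10 ** 9 in valid and 0 not in valid
valid.intersection_update(intbitset([0, 7, 42]))  # narrows to a finite set
assert valid == intbitset([7, 42])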
Example #39
def citation(rank_method_code, related_to, hitset, rank_limit_relevance, verbose):
    """Sort records by number of citations"""
    if related_to:
        from invenio.legacy.search_engine import search_pattern
        hits = intbitset()
        for pattern in related_to:
            hits |= hitset & intbitset(search_pattern(p='refersto:%s' % pattern))
    else:
        hits = hitset
    return rank_by_citations(hits, verbose)
Example #40
def citation(rank_method_code, related_to, hitset, rank_limit_relevance, verbose):
    """Sort records by number of citations"""
    if related_to:
        from invenio.legacy.search_engine import search_pattern
        hits = intbitset()
        for pattern in related_to:
            hits |= hitset & intbitset(search_pattern(p='refersto:%s' % pattern))
    else:
        hits = hitset
    return rank_by_citations(hits, verbose)
Example #41
 def get_records_for_user(qid, uid):
     from invenio.legacy.search_engine import \
         get_records_that_can_be_displayed
     key = get_search_results_cache_key_from_qid(qid)
     data = search_results_cache.get(key)
     if data is None:
         return intbitset([])
     cc = search_results_cache.get(key + '::cc')
     return get_records_that_can_be_displayed(current_user,
                                              intbitset().fastload(data),
                                              cc)
Example #42
 def get_records_for_user(qid, uid):
     from invenio.legacy.search_engine import \
         get_records_that_can_be_displayed
     key = get_search_results_cache_key_from_qid(qid)
     data = search_results_cache.get(key)
     if data is None:
         return intbitset([])
     cc = search_results_cache.get(key + '::cc')
     return get_records_that_can_be_displayed(current_user,
                                              intbitset().fastload(data),
                                              cc)
Example #43
 def _add_user_hard(self, username, password):
     # type: (unicode, unicode) -> User
     points = 0
     empty_buf = buffer(intbitset().fastdump())
     self.db.cursor.execute(
         'INSERT INTO users VALUES (NULL, ?, ?, ?, ?, ?)',
         [username,
          hash_password(password), points, empty_buf, empty_buf])
     user_id = self.db.cursor.lastrowid
     self.commit()
     return User(user_id, username, points, intbitset(), intbitset())
Example #44
 def test_record_sorter(self):
     """bibrank record sorter - sorting records"""
     from invenio.legacy.bibrank import word_searcher as bibrank_word_searcher
     from intbitset import intbitset
     hitset = intbitset()
     hitset += (1, 2, 5)
     hitset2 = intbitset()
     hitset2.add(5)
     rec_termcount = {1: 1, 2: 1, 5: 1}
     (res1, res2) = bibrank_word_searcher.sort_record_relevance(
         {1: 50, 2: 30, 3: 70, 4: 10}, rec_termcount, hitset, 50, 0)
     self.assertEqual(([(1, 71), (3, 100)], list(hitset2)), (res1, list(res2)))
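
As the test above relies on, intbitset accepts plain sequences on +=, unioning the ids into the set; a tiny sketch:

from intbitset import intbitset

hs = intbitset()
hs += (1, 2, 5)  # += with an ordinary tuple unions the ids in
hs.add(5)        # adding an existing id is a no-op
assert list(hs) == [1, 2, 5]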
Example #45
    def test_similarities_ignore_upper_triangle(self):
        bitsets = {"a": intbitset([1, 2, 3]), "b": intbitset([1, 2, 4, 5, 8]), "c": intbitset([1, 2, 4, 8])}

        iterator = modifiedtanimoto.similarities(
            bitsets, bitsets, self.number_of_bits, self.corr_st, self.corr_sto, 0.55, True
        )
        result = [r for r in iterator]

        expected = [("a", "c", 0.5779523809525572), ("b", "c", 0.8357708333333689)]
        # pair a-b is below the cutoff with a similarity of 0.53
        assert_similarities(result, expected)
Example #46
    def modified_requested_recids(self):
        """Record IDs of records that match the filters of this task.

        This property takes (0) `requested_ids`, (1) `filter_pattern` and if
        `force_run_on_unmodified_records` is enabled (2)
        `CheckerRecord.last_run_version_id` into consideration to figure out
        which recids a record-centric task should run on.

        :rtype: intbitset
        """
        # Get all records that are already associated to this rule
        # If this is returning an empty set, you forgot to run bibindex
        try:
            associated_records = intbitset(zip(
                *db.session
                .query(CheckerRecord.rec_id)
                .filter(
                    CheckerRecord.rule_name == self.name
                ).all()
            )[0])
        except IndexError:
            associated_records = intbitset()

        # Store requested records that were until now unknown to this rule
        requested_ids = self.requested_recids
        for requested_id in requested_ids - associated_records:
            new_record = CheckerRecord(rec_id=requested_id,
                                       rule_name=self.name)
            db.session.add(new_record)
        db.session.commit()

        # Figure out which records have been edited since the last time we ran
        # this rule
        try:
            recids = zip(
                *db.session
                .query(CheckerRecord.rec_id)
                .outerjoin(RecordMetadata)
                .filter(
                    CheckerRecord.rec_id.in_(requested_ids),
                    CheckerRecord.rule_name == self.name,
                    db.or_(
                        self.force_run_on_unmodified_records,
                        db.or_(
                            CheckerRecord.last_run_version_id == 1,
                            CheckerRecord.last_run_version_id < RecordMetadata.version_id,
                        ),
                    )
                )
            )[0]
        except IndexError:
            recids = set()
        return intbitset(recids)
Example #47
 def find(self, column, value):
     if self.inverse.has_column(column):
         if hasattr(value, '__call__'):
             result = intbitset()
             for k in self.inverse.keys(column):
                 if value(k):
                     result.union_update(self.inverse.get(column, k))
             return result
         else:
             if self.inverse.has_key(column, value):
                 return self.inverse.get(column, value)
     return intbitset()
Example #48
    def __init__(self,
                 location=None,
                 query_string=None,
                 idx=None,
                 line_threshold=4,
                 _test_mode=False,
                 tokenizer=query_tokenizer):
        """
        Initialize the query from a file `location` or `query_string` string for an
        `idx` LicenseIndex.

        Break query in runs when there are at least `line_threshold` empty lines or
        junk-only lines.
        """
        assert (location or query_string) and idx

        self.location = location
        self.query_string = query_string
        self.idx = idx

        self.line_threshold = line_threshold

        # token ids array
        self.tokens = []

        # index of position -> line number where the pos is the list index
        self.line_by_pos = []

        # index of known position -> number of unknown tokens after that pos
        # for unknowns at the start, the pos is -1
        self.unknowns_by_pos = defaultdict(int)

        # Span of known positions followed by unknown token(s)
        self.unknowns_span = None

        # set of query positions where there is a short, single-letter token or digits-only token
        # TODO: consider using an intbitset
        self.shorts_and_digits_pos = set()

        self.query_runs = []
        if _test_mode:
            return

        self.tokenize_and_build_runs(self.tokens_by_line(tokenizer=tokenizer),
                                     line_threshold=line_threshold)

        # sets of integers initialized after query tokenization
        len_junk = idx.len_junk
        self.high_matchables = intbitset(
            [p for p, t in enumerate(self.tokens) if t >= len_junk])
        self.low_matchables = intbitset(
            [p for p, t in enumerate(self.tokens) if t < len_junk])
Example #49
def get_all_recids(including_deleted=True):  #6.68s on cdsdev
    """Returns a list of all records available in the system"""
    res = run_sql("SELECT id FROM bibrec")
    if not res:
        return intbitset([])
    all_recs = intbitset(res)
    if not including_deleted:  # we want to exclude deleted records
        if CFG_CERN_SITE:
            deleted = search_pattern(p='980__:"DELETED" OR 980__:"DUMMY"')
        else:
            deleted = search_pattern(p='980__:"DELETED"')
        all_recs.difference_update(deleted)
    return all_recs
Example #50
def get_all_recids(including_deleted=True):  # 6.68s on cdsdev
    """Returns a list of all records available in the system"""
    res = run_sql("SELECT id FROM bibrec")
    if not res:
        return intbitset([])
    all_recs = intbitset(res)
    if not including_deleted: # we want to exclude deleted records
        if CFG_CERN_SITE:
            deleted = search_pattern(p='980__:"DELETED" OR 980__:"DUMMY"')
        else:
            deleted = search_pattern(p='980__:"DELETED"')
        all_recs.difference_update(deleted)
    return all_recs
Example #51
def without_fmt(queries, chunk_size=2000):
    """
    List of record IDs to be reformatted, not having the specified format yet.

    @param queries: a dictionary with sql queries to pick from
    @return: a list of record IDs without pre-created format cache
    """
    sql = queries['missing']
    recids = intbitset()
    max_id = run_sql("SELECT max(id) FROM bibrec")[0][0]
    for start in xrange(1, max_id + 1, chunk_size):
        end = start + chunk_size
        recids += intbitset(run_sql(sql, (start, end)))
    return recids
Example #52
 def test_compare_sets_tids_sets_match_with_less_than_ilow_len(self):
     thresholds = Thresholds(high_len=3,
                             low_len=1,
                             length=3,
                             min_high=2,
                             small=False,
                             min_len=2)
     qlow, qhigh = intbitset(), intbitset([3, 4, 6])
     ilow, ihigh = intbitset([1]), intbitset([3, 4, 6])
     candidate = match_set.compare_sets(qhigh, qlow, ihigh, ilow,
                                        thresholds,
                                        match_set.tids_sets_intersector,
                                        match_set.tids_set_counter)
     assert candidate
Example #53
 def merge(self, index):
     for column in index.store.columns():
         for key in index.store.keys(column):
             self.store.set(column, key, index.store.get(column, key))
     for column in index.inverse.columns():
         for key in index.inverse.keys(column):
             if not self.inverse.has_key(column, key):
                 self.inverse.set(column, key, index.inverse.get(column, key))
             else:
                 me = intbitset()
                 me.fastload(self.inverse.get(column, key))
                 other = intbitset()
                 other.fastload(index.inverse.get(column, key))
                 me.union_update(other)
                 self.inverse.set(column, key, me)
Example #54
def get_collection_reclist(coll, recreate_cache_if_needed=True):
    """Return hitset of recIDs that belong to the collection 'coll'."""
    from invenio.modules.search.searchext.engines.native import \
        search_unit_in_idxphrases

    if recreate_cache_if_needed:
        collection_reclist_cache.recreate_cache_if_needed()
    if coll not in collection_reclist_cache.cache:
        return intbitset()
    if not collection_reclist_cache.cache[coll]:
        c_coll = Collection.query.filter_by(name=coll).first()
        if c_coll:
            collection_reclist_cache.cache[coll] = search_unit_in_idxphrases(
                c_coll.name, 'collection', 'e')
    return collection_reclist_cache.cache[coll] or intbitset()
Example #55
        def fill():
            alldicts = {}
            from invenio.legacy.bibrank.tag_based_indexer import fromDB
            serialized_weights = cache.get('citations_weights')
            if serialized_weights:
                weights = deserialize_via_marshal(serialized_weights)
            else:
                weights = fromDB('citation')

            alldicts['citations_weights'] = weights
            # for cited:M->N queries, it is interesting to cache also
            # some preprocessed citationdict:
            alldicts['citations_keys'] = intbitset(weights.keys())

            # Citation counts
            alldicts['citations_counts'] = [t for t in iteritems(weights)]
            alldicts['citations_counts'].sort(key=itemgetter(1), reverse=True)

            # Self-cites
            serialized_weights = cache.get('selfcites_weights')
            if serialized_weights:
                selfcites = deserialize_via_marshal(serialized_weights)
            else:
                selfcites = fromDB('selfcites')
            selfcites_weights = {}
            for recid, counts in alldicts['citations_counts']:
                selfcites_weights[recid] = counts - selfcites.get(recid, 0)
            alldicts['selfcites_weights'] = selfcites_weights
            alldicts['selfcites_counts'] = [(recid, selfcites_weights.get(recid, cites)) for recid, cites in alldicts['citations_counts']]
            alldicts['selfcites_counts'].sort(key=itemgetter(1), reverse=True)

            return alldicts
Example #56
def search_unit(query, f, m, wl=None):
    """Search in fulltext."""
    from invenio.legacy.search_engine import (search_unit_in_bibwords,
                                              search_pattern)
    from invenio.legacy.miscutil.solrutils_bibindex_searcher import (
        solr_get_bitset)
    from invenio.legacy.miscutil.xapianutils_bibindex_searcher import (
        xapian_get_bitset)
    from ...utils import get_idx_indexer

    def fix(p):
        if m and (m == 'a' or m == 'r'):  # phrase/regexp query
            if p.startswith('%') and p.endswith('%'):
                p = p[1:-1]  # fix for partial phrase
            p = '"' + p + '"'
        return p

    indexers = {
        'SOLR': solr_get_bitset,
        'XAPIAN': xapian_get_bitset,
    }
    indexer = get_idx_indexer('fulltext')
    if indexer in indexers and \
            current_app.config.get('CFG_{}_ENABLED'.format(indexer), False):
        try:
            return indexers[indexer](fix(query), f, m)
        except:
            current_app.logger.exception("Fulltext search is broken.")
            return intbitset()
    elif m == 'a' or m == 'r':
        # FIXME: workaround for not having phrase index yet
        return search_pattern(p=query, f=f, m='w')
    # FIXME raise ContinueSearch(query, f, m, wl)
    return search_unit_in_bibwords(query, f, wl=wl)
Example #57
 def all_recids(self):
     """Get all recids that are assumed to exist by tasks of this master."""
     identifier = self.fmt(master_all_recids)
     recids_set = self.conn.get(identifier)
     if recids_set is None:
         return None
     return intbitset(recids_set)