def _get_data(args):
    """Yield corpus documents keyed between args.date_from and args.date_to.

    Documents whose URL contains any DISCARD_URLS fragment are skipped.
    """
    start = 'news:{}:'.format(args.date_from.replace('-', ''))
    # ';' sorts immediately after ':', so this upper bound presumably makes
    # the range inclusive of every key under the date_to prefix.
    stop = 'news:{};'.format(args.date_to.replace('-', ''))
    db = datastore.corpus_db()
    for _key, doc in db.range(start, stop):
        if all(uri not in doc['url'] for uri in DISCARD_URLS):
            yield doc
def main(args):
    """Search the corpus index and print matching documents.

    Builds a Xapian query from the positional query terms, optionally
    restricted by category, exact date(s), or a date range, then prints
    each hit and finally drops into an IPython shell for inspection.
    """
    # search corpus index
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)
    query = ' '.join(args.query)
    print "Search {} documents for '{}'".format(
        sconn.get_doccount(), args.query
    )
    q = sconn.query_parse(query, default_op=sconn.OP_AND)
    # AND the parsed query with an OR over the requested categories
    if args.category:
        qc = q.compose(q.OP_OR, [
            sconn.query_field('category', c) for c in args.category
        ])
        q = q & qc
    # AND with an OR over the requested exact dates
    if args.date:
        qd = q.compose(q.OP_OR, [
            sconn.query_field('date', d) for d in args.date
        ])
        q = q & qd
    # restrict to the [date_start, date_end] range when both ends are given
    if args.date_start and args.date_end:
        qr = sconn.query_range('date', args.date_start, args.date_end)
        q = q.filter(qr)
    # --sort takes "field,direction"; None means backend default order
    if args.sort:
        sortby = [tuple(args.sort.split(','))]
    else:
        sortby = None
    print 'Query: {!r}'.format(q)
    results = execute_query(sconn, q, args.offset, args.limit,
                            getfacets=args.facet,
                            allowfacets=('category',),
                            sortby=sortby)
    if results.estimate_is_exact:
        print "Found {} results".format(results.matches_estimated)
    else:
        print "Found approximately {} results".format(results.matches_estimated)
    for i, result in enumerate(results, 1):
        # the full document lives in the corpus db, keyed by the xapian id
        doc = db[result.id]
        # first indexed term for each field; 'none' when the field is absent
        try:
            cat = result.get_terms('category').next()
        except StopIteration:
            cat = 'none'
        try:
            date = result.get_terms('date').next()
        except StopIteration:
            date = 'none'
        print "{:2}. {} -- {} -- {}\n\t{}\n\t{}\n".format(
            i, cat, doc['headline'], date, doc['url'], result.id)
    # deliberate: open an interactive shell with the results in scope
    from IPython import embed; embed()
def load_jsonlines(stream):
    """Load JSON-lines documents from *stream* into the corpus db.

    Each line is parsed, preprocessed, and written under its computed key
    inside a single write batch.

    Returns the number of documents written (0 for an empty stream).
    """
    # do batch write
    db = datastore.corpus_db()
    count = 0  # bug fix: was unset when the stream was empty (NameError)
    with db.write_batch() as wb:
        for line in stream:
            doc = preprocess(json.loads(line))
            wb[get_key(doc)] = doc
            count += 1
    return count
def main(args): # index corpus db = datastore.corpus_db() if args.prefix: items = db.range(args.prefix, next_key(args.prefix)) elif args.from_classified: items = get_classified_items(args.from_classified, db) else: items = db.range() count = search.index(items, 'corpus', create=args.create) print "Indexed {} documents".format(count)
def _get_data(category):
    """Yield corpus documents belonging to *category*.

    Pages through the Xapian index 1000 hits at a time, looking each hit
    up in the corpus db, and skips any document whose URL contains a
    DISCARD_URLS fragment.
    """
    # load data from xapian categories
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)
    q = sconn.query_field('category', category)
    offset = 0
    limit = 1000
    while True:
        results = sconn.search(q, offset, offset + limit)
        # bug fix: the original iterated db.range(key_from, key_to) with
        # key_from/key_to undefined (NameError), never consumed `results`,
        # never advanced `offset`, and never terminated.
        matched = False
        for result in results:
            matched = True
            doc = db[result.id]
            if not any(uri in doc['url'] for uri in DISCARD_URLS):
                yield doc
        if not matched:
            break  # no hits in this page: the result set is exhausted
        offset += limit
def load_csv(stream):
    """Load CSV rows from *stream* into the corpus db.

    The first row is read as the field names and must contain every
    REQUIRED_FIELDS entry; remaining rows are decoded from UTF-8,
    preprocessed, and written under their computed keys in one batch.

    Returns the number of documents written (0 when there are no rows).

    Raises:
        ValueError: if a required field is missing from the header row.
    """
    # read first row as fields
    fields = csv.reader(stream).next()
    if any(f not in fields for f in REQUIRED_FIELDS):
        raise ValueError(
            "Required fields: {}".format(','.join(REQUIRED_FIELDS))
        )
    # the header row is already consumed, so pass the fieldnames explicitly
    reader = csv.DictReader(stream, fields)
    # do batch write
    db = datastore.corpus_db()
    count = 0  # bug fix: was unset when the CSV had no data rows (NameError)
    with db.write_batch() as wb:
        for doc in reader:
            doc = preprocess(dict(
                (k, v.decode('utf-8')) for k, v in doc.iteritems()
            ))
            wb[get_key(doc)] = doc
            count += 1
    return count
def load_keys(fromkey, offset, limit):
    """Return a page of values for keys under *fromkey*'s prefix.

    The page is the half-open slice [offset, offset + limit) of the
    key range [fromkey, next_key(fromkey)).
    """
    db = datastore.corpus_db()
    pairs = db.range(fromkey, next_key(fromkey))
    window = itertools.islice(pairs, offset, offset + limit)
    return [value for _key, value in window]
def load_keys(fromkey, offset, limit):
    """Return values offset..offset+limit for keys starting at *fromkey*."""
    upper = next_key(fromkey)
    db = datastore.corpus_db()
    values = []
    sliced = itertools.islice(db.range(fromkey, upper), offset, offset + limit)
    for _k, v in sliced:
        values.append(v)
    return values
def main(args): # search corpus index db = datastore.corpus_db() sconn = xappy.SearchConnection(settings.XAPIAN_DB) query = ' '.join(args.query) print "Search {} documents for '{}'".format(sconn.get_doccount(), args.query) q = sconn.query_parse(query, default_op=sconn.OP_AND) if args.category: qc = q.compose( q.OP_OR, [sconn.query_field('category', c) for c in args.category]) q = q & qc if args.date: qd = q.compose(q.OP_OR, [sconn.query_field('date', d) for d in args.date]) q = q & qd if args.date_start and args.date_end: qr = sconn.query_range('date', args.date_start, args.date_end) q = q.filter(qr) if args.sort: sortby = [tuple(args.sort.split(','))] else: sortby = None print 'Query: {!r}'.format(q) results = execute_query(sconn, q, args.offset, args.limit, getfacets=args.facet, allowfacets=('category', ), sortby=sortby) if results.estimate_is_exact: print "Found {} results".format(results.matches_estimated) else: print "Found approximately {} results".format( results.matches_estimated) for i, result in enumerate(results, 1): doc = db[result.id] try: cat = result.get_terms('category').next() except StopIteration: cat = 'none' try: date = result.get_terms('date').next() except StopIteration: date = 'none' print "{:2}. {} -- {} -- {}\n\t{}\n\t{}\n".format( i, cat, doc['headline'], date, doc['url'], result.id) from IPython import embed embed()