Exemplo n.º 1
0
def generate_concordances_machinetags(concordances):
    """Return one flat list of the magic_8s() output for every concordance.

    Each item of `concordances` is wrapped in machinetag.machinetag() and
    everything its magic_8s() call yields is added to the result, keeping
    the order of the input.
    """

    return [
        chunk
        for raw in concordances
        for chunk in machinetag.machinetag(raw).magic_8s()
    ]
Exemplo n.º 2
0
def generate_concordances_machinetags_hierarchy(concordances):
    """Return a "namespace/predicate/value" string for every concordance.

    Each item of `concordances` is parsed with machinetag.machinetag() and
    its three parts are joined with slashes. The result list has one entry
    per input item, in input order.
    """

    def as_path(raw):
        # Render a single concordance as a slash-separated path.
        parsed = machinetag.machinetag(raw)
        parts = (parsed.namespace(), parsed.predicate(), parsed.value())
        return "%s/%s/%s" % parts

    return [as_path(raw) for raw in concordances]
Exemplo n.º 3
0
def import_links(options):
    """Index the bookmarks of a JSON export file into Solr.

    `options` must provide three attributes:

      solr     -- passed to pysolr.Solr() to build the client
      purge    -- when truthy, all existing documents are deleted first
      pinboard -- path of the JSON export file to read

    The export is expected to be a list of dicts carrying at least the keys
    'tags', 'shared', 'toread', 'description' and 'href'. Every dict is
    rewritten in place before being sent to Solr:

      * 'tags' (a space-separated string) becomes a list of tags
      * tags that are machine tags additionally populate 'machinetags'
        (de-duplicated magic_8s() chunks) and 'machinetags_hierarchy'
        ("namespace/predicate/value" strings)
      * 'shared' and 'toread' become booleans (True only for 'yes')
      * an empty 'description' falls back to the bookmark URL
      * 'hostname' is added: the URL's host without a leading "www."

    Documents are added in batches and the index is optimized at the end.
    """

    solr = pysolr.Solr(options.solr)

    # Read the export before touching the index, so that an unreadable or
    # malformed file cannot leave Solr purged with nothing to replace it.
    # The context manager also guarantees the file handle is closed.
    with open(options.pinboard, 'r') as fh:
        data = json.load(fh)

    if options.purge:
        logging.info("purging all existing bookmarks...")
        solr.delete(q='*:*')

    # Number of documents sent to Solr per add() call.
    batch_size = 1000
    www_prefix = "www."

    docs = []

    for doc in data:

        tags = []
        machinetags = []
        machinetags_hierarchy = []

        for t in doc['tags'].split(' '):

            tags.append(t)

            mt = machinetag.machinetag(t)

            if not mt.is_machinetag():
                continue

            # Keep first-seen order while dropping repeated chunks.
            for chunk in mt.magic_8s():
                if chunk not in machinetags:
                    machinetags.append(chunk)

            hier = (mt.namespace(), mt.predicate(), mt.value())
            machinetags_hierarchy.append("/".join(map(unicode, hier)))

        if tags:
            doc['tags'] = tags

        if machinetags:
            doc['machinetags'] = machinetags
            doc['machinetags_hierarchy'] = machinetags_hierarchy

        for key in ('shared', 'toread'):
            doc[key] = (doc[key] == 'yes')

        if doc['description'] == '':
            doc['description'] = doc['href']

        hostname = urlparse.urlparse(doc['href']).hostname

        # Strip only a leading "www."; str.replace() would also remove the
        # same sequence anywhere else in the host name.
        if hostname and hostname.startswith(www_prefix):
            hostname = hostname[len(www_prefix):]

        doc['hostname'] = hostname

        docs.append(doc)

        if len(docs) == batch_size:
            solr.add(docs)
            docs = []

    # Flush whatever is left over from the last, partially filled batch.
    if docs:
        solr.add(docs)

    logging.debug("import complete, optimizing...")
    solr.optimize()