예제 #1
0
def recommend_by_url(url, limit=15):
    """Return indexed articles ranked against the page found at ``url``.

    The page is downloaded, reduced to plain text, converted to bigrams with
    ``make_bigrams`` and then passed through the module-level ``dictionary``,
    ``lsi`` and ``index`` objects to obtain one score per ``ARTICLES`` entry.

    Args:
        url: Address of the page to find recommendations for.
        limit: Maximum number of articles to return. Defaults to 15, the cap
            that was previously hard-coded.

    Returns:
        A list of at most ``limit`` dicts, highest score first. Each one is a
        shallow copy of an ``ARTICLES`` entry with a float ``score`` added,
        the ``content`` and ``html`` keys removed and ``summary`` stripped of
        surrounding whitespace. Entries are skipped when they have no
        ``url``, when their URL was already seen, or when their hostname ends
        with the hostname of ``url``.
    """
    source_host = urlparse(url).hostname or ''
    # NOTE(review): no timeout is passed to requests.get, so a stalled server
    # blocks this call indefinitely -- confirm whether that is acceptable.
    page = Document(requests.get(url).content)
    text = html.fromstring(page.content()).xpath('string()')
    vec_bow = dictionary.doc2bow(make_bigrams(text))
    sims = index[lsi[vec_bow]]
    ranked = sorted(enumerate(sims), key=lambda item: -item[1])

    results = []
    seen = set()
    for doc_id, score in ranked:
        if len(results) >= limit:
            break
        article = ARTICLES[doc_id]
        if 'url' not in article or article['url'] in seen:
            continue
        # Recorded before the host check so a same-site URL is parsed once.
        seen.add(article['url'])
        candidate_host = urlparse(article['url']).hostname or ''
        # A stored URL without a hostname can never match the source site.
        if source_host and candidate_host.endswith(source_host):
            continue
        # Work on a copy: stripping fields from the shared ARTICLES entry
        # would remove them for every later caller.
        res = dict(article)
        res['score'] = float(score)
        res.pop('content', None)
        res.pop('html', None)
        if res.get('summary'):
            res['summary'] = res['summary'].strip()
        results.append(res)
    return results
예제 #2
0
def generate_texts():
    """Compute bigrams for every article in ``table`` and save them as JSON.

    Each article gets a ``bigrams`` key holding the result of
    ``make_bigrams`` on its ``content`` (an empty string when the key is
    absent). Progress is printed as ``[index, bigram_count]`` per article.

    Returns:
        A list with one entry per article: the article's bigrams followed by
        its ``url`` (``None`` when the article has no ``url`` key). The same
        list is written to ``texts.json`` in the current directory.
    """
    texts = []
    for i, article in enumerate(table):
        article['bigrams'] = make_bigrams(article.get('content', ''))
        # Parenthesised form prints the same list on Python 2 and Python 3.
        print([i, len(article['bigrams'])])
        texts.append(article['bigrams'] + [article.get('url')])
    # Open the file only once all data is ready, so a failure above does not
    # leave a truncated texts.json behind. json.dump writes text, hence 'w'.
    with open('texts.json', 'w') as fh:
        json.dump(texts, fh)
    return texts