# Example 1
# 0
def content_scraper(table):
    """Spawn one worker thread per document that lacks a 'content' field.

    Iterates the documents in *table*, skips any that already contain a
    'content' key, and starts a ``content_adder_thread`` for each remaining
    one. A randomized delay between launches throttles the scrape rate.

    Args:
        table: database collection handle understood by ``table_to_list``
            and ``content_adder_thread``.
    """
    docs = table_to_list(table)
    for i, doc in enumerate(docs):
        # Already scraped; skip.
        if 'content' in doc:
            continue
        # name must be a string: threading.Thread uses a truthiness check,
        # so name=0 (the first index) would silently get an auto name.
        thread = threading.Thread(
            name=str(i),
            target=content_adder_thread,
            args=(table, doc, i),
        )
        thread.start()
        # Randomized politeness delay (~0.3-0.63s) between launches.
        time.sleep(np.random.random() / 3 + 0.3)
# Example 2
# 0
    def __init__(self):
        """Set up the NYT article-search client.

        Opens the 'nyt' collection, records the URLs of articles already
        stored (to skip duplicates later), and prepares the API endpoint
        and request payload.
        """
        self.i = 0
        self.table = st.open_database_collection('nyt')

        # URLs already present in the collection.
        existing = st.table_to_list(self.table)
        self.seen_urls = {record['web_url'] for record in existing}

        self.link = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
        # API key comes from the environment; KeyError here means it is unset.
        self.payload = {'api-key': os.environ['NYT_API_KEY']}
        self._set_filters()
def remove_dups(table):
    """Delete duplicate documents from *table*, keeping one per link.

    Builds a link -> _id mapping (later documents win), then deletes every
    document whose _id is not the kept one for its link. NYT API documents
    store the URL under 'web_url'; those are normalized onto 'link' first.

    Args:
        table: database collection handle supporting ``delete_one`` and
            understood by ``st.table_to_list``.
    """
    docs = st.table_to_list(table)
    if not docs:
        # Empty collection: the 'web_url' probe below would IndexError.
        return

    # NYT documents use 'web_url' instead of 'link'; normalize in place.
    if 'web_url' in docs[0]:
        for doc in docs:
            doc['link'] = doc['web_url']

    # For each link, the last-seen _id survives (dict keeps the last value).
    survivors = {doc['link']: doc['_id'] for doc in docs}
    keep_ids = set(survivors.values())
    all_ids = {doc['_id'] for doc in docs}

    for _id in all_ids - keep_ids:
        table.delete_one(filter={'_id': _id})