def main():
    accounts = config.conf['instagram']['accounts']
    client, db, collection = mongo.mongo_connect('instagramm', 'media')
    for account in accounts:
        data = get_urls(account)
        for date in data:
            insta_url = data[date].pop()
            img_url = data[date].pop()
            # build a 12-byte ObjectId from the last 12 characters of the
            # post URL (reversed), so each Instagram post gets a stable id
            post_id = mongo.ObjectId(str(insta_url[-1:-13:-1]))
            # only store and forward posts we have not seen before
            if mongo.check_id(post_id, collection):
                post = {"_id": post_id,
                        "author": account,
                        "date": date,
                        "img_url": img_url,
                        "insta_url": insta_url}
                collection.insert_one(post)
                media = requests.get(img_url, stream=True)
                if media.status_code == 200:
                    path = '/tmp/' + date + '.jpeg'
                    with open(path, 'wb') as f:
                        for chunk in media.iter_content(1024):
                            f.write(chunk)
                    # telegramm_bot.send_message(path, type='photo')
                    telegramm_bot.send_message(insta_url, type='text')
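# `mongo.check_id` is defined elsewhere; a minimal sketch of what it
# plausibly does, inferred from the fact that the insert branch above only
# runs when it returns True. This implementation is an assumption, not the
# repository's actual helper.
def check_id(post_id, collection):
    """Return True if no document with this _id exists yet."""
    return collection.find_one({"_id": post_id}) is None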
def query(): """fill in the listings in mongo with null neighborhoods! """ coll = mongo_connect(MONGO_USER, MONGO_PW) count = coll.find({"neighborhood": "Unknown"}).count() print count
def fill_null_neighborhoods():
    """fill in the listings in mongo with null neighborhoods!"""
    coll = mongo_connect(MONGO_USER, MONGO_PW)
    listings = coll.find({"neighborhood": None},
                         {'_id': 1, 'url': 1}).sort("_id", -1)
    browser = webdriver.Chrome(
        executable_path="/Users/davidhey/Documents/chromedriver")
    browser.set_window_position(0, 0)
    browser.set_window_size(400, 1000)
    for listing in listings:
        pprint.pprint(listing)
        browser.get(listing['url'])
        try:
            # wait for the listing page to render before scraping it
            wait = WebDriverWait(browser, 30)
            wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'actions')))
            html_doc = browser.page_source
            soup = BeautifulSoup(html_doc, "html.parser")
            # extract the neighborhood from the page
            neighborhood = ''
            if "This unit is not currently listed on StreetEasy" in soup.text:
                print "\nNO LONGER LISTED\n"
                details = soup.find_all("div", class_="details_info")
                try:
                    neighborhood = details[0].text.strip().split(" in ")[1]
                except IndexError:
                    neighborhood = details[0].text.strip()
            else:
                nobreaks = soup.find_all("span", class_="nobreak")
                for nobreak in nobreaks:
                    if nobreak.find("a") is not None:
                        neighborhood = nobreak.a.text
            # write the recovered neighborhood back to mongo
            if neighborhood != '':
                print neighborhood
                result = coll.update_one(
                    {"_id": listing['_id']},
                    {"$set": {"neighborhood": neighborhood}})
                print "modified:", result.modified_count
        except TimeoutException:
            print "page took too long to load..."
def test_commute(destination_name):
    """spot-check a few listings that already have commute data"""
    coll = mongo_connect(MONGO_USER, MONGO_PW)
    ids = coll.find({
        destination_name + "distance_m": {"$exists": True}
    }).limit(5)
    for m_id in ids:
        pprint.pprint(m_id)
    return None
def update_places():
    """one time batch run for adding places info to existing data"""
    coll = mongo_connect(MONGO_USER, MONGO_PW)
    ids = coll.find({"coffee_count": {"$exists": False}},
                    {'_id': 1, 'latlong': 1})
    for m_id in ids:
        place_fields = place_agg(m_id['latlong'])
        coll.update_one({"_id": m_id['_id']}, {"$set": place_fields})
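# `place_agg` is defined elsewhere; a minimal sketch of the shape it must
# return, inferred from the fields used in this section (coffee_count,
# coffee_names, grocery_names). The Google Places calls, the 500 m radius,
# and the module-level googlemaps `client` are assumptions.
def place_agg(latlong):
    lat, lng = [float(x) for x in latlong.split(',')]
    fields = {}
    for kind, prefix in [('cafe', 'coffee'),
                         ('grocery_or_supermarket', 'grocery')]:
        resp = client.places_nearby(location=(lat, lng), radius=500, type=kind)
        names = [p['name'] for p in resp.get('results', [])]
        fields[prefix + '_count'] = len(names)
        fields[prefix + '_names'] = names
    return fields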
def main(): """pull all data from mongo into a csv """ coll = mongo_connect(MONGO_USER, MONGO_PW) #listings = coll.find({"no_fee": {"$regex": u"NO"}}).limit(10) listings = coll.find() df = pd.DataFrame(list(listings)) #pprint.pprint(listings[0]) pprint.pprint(df.columns.values) df['coffee_names'] = df.apply( lambda row: replace_unicode(row['coffee_names']), axis=1) df['grocery_names'] = df.apply( lambda row: replace_unicode(row['grocery_names']), axis=1) print df.head() df.to_csv("./mongo_listings.csv", encoding='utf-8')
def add_commute(destination_address, destination_name):
    """add a commute time for a new place of work"""
    coll = mongo_connect(MONGO_USER, MONGO_PW)
    ids = coll.find({destination_name + "distance_m": {"$exists": False}},
                    {'_id': 1, 'latlong': 1}).limit(2500)
    # set up google maps client/variables
    api_key = SECRET_KEY
    client = googlemaps.Client(key=api_key)
    origins_dict = {destination_name: destination_address}
    for m_id in ids:
        ll = m_id["latlong"].split(',')
        latlng = {"lat": float(ll[0]), "lng": float(ll[1])}
        destinations = [latlng]
        for prefix, addr in origins_dict.iteritems():
            origins = [addr]
            commute_payload, client = \
                commute_time(origins, destinations, api_key, client)
            # prefix each field with the destination name, e.g.
            # "workdistance_m", so multiple destinations can coexist
            for key in commute_payload.keys():
                commute_payload[prefix + key] = commute_payload.pop(key)
        coll.update_one({"_id": m_id['_id']}, {"$set": commute_payload})
    return None
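# `commute_time` is defined elsewhere; a minimal sketch assuming it wraps
# the googlemaps distance-matrix endpoint and returns un-prefixed keys.
# Only "distance_m" is confirmed by the queries in this section; the
# "duration_s" key and the transit mode are assumptions.
def commute_time(origins, destinations, api_key, client):
    matrix = client.distance_matrix(origins, destinations, mode="transit")
    element = matrix["rows"][0]["elements"][0]
    payload = {
        "distance_m": element["distance"]["value"],  # meters
        "duration_s": element["duration"]["value"],  # seconds (assumed key)
    }
    return payload, client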
    else:
        return result


if __name__ == '__main__':
    sys.path.append(os.path.dirname(__file__))

    KEY = config.conf['telegram']['token']
    MY_ID = config.conf['telegram']['my_id']
    max_message_size = config.conf['telegram']['max_message_size']

    # mongo
    host = config.conf['mongo']['host']
    port = int(config.conf['mongo']['port'])
    db = config.conf['mongo']['db']
    collection = config.conf['mongo']['collection']
    db = mongo.mongo_connect(host, port, db)
    collection = db[collection]

    # lepra
    client_id = config.conf['lepra']['client_id']

    text_documents = ['text', 'contact', 'location', 'venue']
    file_documents = ['photo', 'document', 'audio', 'video', 'voice']

    bot = create_bot(KEY)
    bot.message_loop(handle)

    # Keep the program running.
    while 1:
        time.sleep(10)
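# `create_bot` is shared by this script and the next one; a minimal sketch
# assuming the telepot library, whose Bot.message_loop() API both scripts
# call. The one-line implementation is an assumption.
import telepot

def create_bot(token):
    return telepot.Bot(token)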
    logging.debug(ex)


def main(disk_bot):
    disk_bot.message_loop({'chat': on_chat_message})
    logging.info('Listening ...')
    # keep the program running
    while 1:
        time.sleep(10)


token = config.token
disk_bot = create_bot(token)
host = config.host
port = int(config.port)
db = config.db
collection = config.collection
client, db, collection = mongo.mongo_connect(host, port, db, collection)

text_documents = ['text', 'contact', 'location', 'venue']
file_documents = ['photo', 'document', 'audio', 'video', 'voice']

logging.basicConfig(
    format=u'[%(asctime)s] %(filename)s[LINE:%(lineno)d] %(message)s',
    level=logging.DEBUG,
    filename=u'/var/log/YaDisk/t_bot.log')

if __name__ == '__main__':
    main(disk_bot)
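# `get_posts` is used by the lepra scraper below but defined elsewhere; a
# plausible sketch returning the raw HTML of a sub-site's front page (the
# URL pattern is inferred from the comments URL built below). Any
# authentication via the lepra client_id is omitted here as an assumption.
import requests

def get_posts(site):
    resp = requests.get("https://{}.leprosorium.ru/".format(site))
    if resp.status_code != 200:
        return None
    return resp.text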
post = re.sub(r'" alt=.*height="[0-9]+', ' ', post) post = re.sub(r' <div class="b-external_image.*media.giphy.com', ' ', post) post = re.sub(r'" border="[0-9]', ' ', post) post = re.sub(r'" width=.*px', ' ', post) post = re.sub(r'" data-start_time="[0-9]+', ' ', post) post = re.sub(r'title="', ' ', post) post = re.sub(r'" rel="coub".*data-start_time="[0-9]+', ' ', post) post = re.sub(r' rel="coub".*data-preview_height="[0-9]+', ' ', post) return post sites = config.conf['lepra']['sites'] db_name = config.conf['lepra']['db_name'] for site in sites: client, db, collection = mongo.mongo_connect(db_name, site) html = get_posts(site) if not html: continue soup = BeautifulSoup(html) results = soup.findAll("div", {"class": re.compile("^(post.*)$")}) for post_html in results: try: post = post_html.find('div', {'class': 'dti p_body'}) post_num = re.match(r'<div class="post.*"? id="p([0-9]+)" data', str(post_html)) post_num = post_num.group(1) url = "https://{}.leprosorium.ru/comments/{}/".format(site, post_num) # url = post_html.find('span', {'class': 'b-post_comments_links'}).a.get('href') post = str(post)