Exemplo n.º 1
0
def main():
    """Store new Instagram posts in MongoDB and announce them via Telegram.

    For every account listed under ``config.conf['instagram']['accounts']``
    the mapping returned by ``get_urls(account)`` is walked date by date.
    Each entry is turned into a document, inserted into the ``media``
    collection of the ``instagramm`` database, the image is downloaded to
    ``/tmp/<date>.jpeg`` and the post URL is sent through ``telegramm_bot``.
    """
    accounts = config.conf['instagram']['accounts']

    client, db, collection = mongo.mongo_connect('instagramm', 'media')

    for account in accounts:
        data = get_urls(account)
        for date in data:
            # Each per-date list is consumed from the end: the post URL is
            # popped first, the image URL second.
            insta_url = data[date].pop()
            img_url = data[date].pop()
            # The document id is built from the last 12 characters of the
            # post URL, in reverse order.
            post_id = mongo.ObjectId(str(insta_url[-1:-13:-1]))

            # NOTE(review): check_id presumably returns a truthy value when
            # the id is not stored yet - confirm against mongo.check_id.
            if not mongo.check_id(post_id, collection):
                continue

            post = {"_id": post_id,
                    "author": account,
                    "date": date,
                    "img_url": img_url,
                    "insta_url": insta_url}
            collection.insert_one(post)

            media = requests.get(img_url, stream=True)
            try:
                downloaded = media.status_code == 200
                if downloaded:
                    path = '/tmp/' + date + '.jpeg'
                    with open(path, 'wb') as f:
                        for chunk in media.iter_content(1024):
                            f.write(chunk)
            finally:
                # A streamed response keeps its connection checked out until
                # it is fully read or closed, so always release it here.
                media.close()

            if downloaded:
                # Sent only after the image file has been written and closed.
                telegramm_bot.send_message(insta_url, type='text')
Exemplo n.º 2
0
def query():
    """Print the number of listings whose neighborhood is "Unknown"."""
    coll = mongo_connect(MONGO_USER, MONGO_PW)
    # NOTE(review): Cursor.count() is deprecated in newer PyMongo releases;
    # revisit this call when the driver is upgraded.
    unknown_count = coll.find({"neighborhood": "Unknown"}).count()

    # The single-argument parenthesised form prints the same on Python 2 and 3.
    print(unknown_count)
Exemplo n.º 3
0
def _extract_neighborhood(soup):
    """Return the neighborhood named on a parsed listing page, or ''.

    Delisted pages (recognised by their notice text) are read from the first
    ``details_info`` div; live pages from the last ``span.nobreak`` that
    contains a link. Prints a notice when the page is delisted.
    """
    if "This unit is not currently listed on StreetEasy" in soup.text:
        print("\nNO LONGER LISTED\n")
        details = soup.find_all("div", class_="details_info")
        if not details:
            # No details block at all: nothing to extract.
            return ''
        text = details[0].text.strip()
        parts = text.split(" in ")
        # Fall back to the whole text when there is no " in " separator.
        return parts[1] if len(parts) > 1 else text

    neighborhood = ''
    for nobreak in soup.find_all("span", class_="nobreak"):
        if nobreak.find("a") is not None:
            neighborhood = nobreak.a.text
    return neighborhood


def fill_null_neighborhoods(
        chromedriver_path="/Users/davidhey/Documents/chromedriver"):
    """Fill in the neighborhood of Mongo listings where it is null.

    Every listing whose ``neighborhood`` is ``None`` is opened in Chrome
    (newest ``_id`` first), the neighborhood is scraped from the rendered
    page and, when one is found, written back with ``$set``.

    Args:
        chromedriver_path: location of the chromedriver executable; defaults
            to the path the script has always used.
    """
    coll = mongo_connect(MONGO_USER, MONGO_PW)
    listings = coll.find(
        {"neighborhood": None},
        {'_id': 1, 'url': 1},
    ).sort("_id", -1)

    browser = webdriver.Chrome(executable_path=chromedriver_path)
    try:
        browser.set_window_position(0, 0)
        browser.set_window_size(400, 1000)
        wait = WebDriverWait(browser, 30)

        for listing in listings:
            pprint.pprint(listing)
            browser.get(listing['url'])
            try:
                # Wait until the element with class 'actions' is present
                # before reading the page source.
                wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'actions')))
                html_doc = browser.page_source
            except TimeoutException:
                print("page took too long to load...")
                continue

            soup = BeautifulSoup(html_doc, "html.parser")
            neighborhood = _extract_neighborhood(soup)

            # Update the listing only when a neighborhood was actually found.
            if neighborhood != '':
                print(neighborhood)
                result = coll.update_one(
                    {"_id": listing['_id']},
                    {"$set": {"neighborhood": neighborhood}})
                print("modified: %s" % result.modified_count)
    finally:
        # Always shut the browser down, even if a page or update fails.
        browser.quit()
Exemplo n.º 4
0
def test_commute(destination_name):
    """Pretty-print up to five listings that have a commute distance stored.

    The field looked for is ``destination_name`` immediately followed by
    ``"distance_m"``. Returns ``None``.
    """
    collection = mongo_connect(MONGO_USER, MONGO_PW)
    distance_field = destination_name + "distance_m"
    has_distance = {distance_field: {"$exists": True}}

    for document in collection.find(has_distance).limit(5):
        pprint.pprint(document)
Exemplo n.º 5
0
def update_places():
    """One-time batch run that adds places info to existing listings.

    Listings without a ``coffee_count`` field are fetched (only ``_id`` and
    ``latlong``), ``place_agg`` is called on each ``latlong`` and its result
    is written back to the listing with ``$set``.
    """
    collection = mongo_connect(MONGO_USER, MONGO_PW)

    lacks_places = {"coffee_count": {"$exists": False}}
    needed_fields = {'_id': 1, 'latlong': 1}

    for listing in collection.find(lacks_places, needed_fields):
        updates = place_agg(listing['latlong'])
        collection.update_one({"_id": listing['_id']}, {"$set": updates})
Exemplo n.º 6
0
def main():
    """Export every document of the Mongo collection to a CSV file.

    The documents are loaded into a DataFrame, the ``coffee_names`` and
    ``grocery_names`` columns are passed row by row through
    ``replace_unicode``, and the result is written to
    ``./mongo_listings.csv`` as UTF-8. Column names and the first rows are
    printed along the way.
    """
    collection = mongo_connect(MONGO_USER, MONGO_PW)
    records = list(collection.find())
    df = pd.DataFrame(records)
    pprint.pprint(df.columns.values)

    # Same row-wise clean-up for both name columns, in the original order.
    for column in ('coffee_names', 'grocery_names'):
        df[column] = df.apply(
            lambda row, column=column: replace_unicode(row[column]), axis=1)

    print(df.head())
    df.to_csv("./mongo_listings.csv", encoding='utf-8')
Exemplo n.º 7
0
def add_commute(destination_address, destination_name):
    """Add commute fields for a new place of work to existing listings.

    Up to 2500 listings that do not yet have the field
    ``destination_name + "distance_m"`` are processed. For each one,
    ``commute_time`` is called with the work address as origin and the
    listing's coordinates as destination; every key of the returned payload
    is prefixed with ``destination_name`` and stored with ``$set``.

    Args:
        destination_address: address of the place of work.
        destination_name: prefix for the stored field names.

    Returns:
        None.
    """
    coll = mongo_connect(MONGO_USER, MONGO_PW)
    ids = coll.find({
        destination_name + "distance_m": {
            "$exists": False
        }
    }, {
        '_id': 1,
        'latlong': 1
    }).limit(2500)

    # Set up the Google Maps client.
    api_key = SECRET_KEY
    client = googlemaps.Client(key=api_key)

    for m_id in ids:
        # 'latlong' is read as a "lat,lng" string.
        ll = m_id["latlong"].split(',')
        destinations = [{"lat": float(ll[0]), "lng": float(ll[1])}]
        origins = [destination_address]

        # commute_time hands back the client as well; keep using that one.
        commute_payload, client = \
            commute_time(origins, destinations, api_key, client)

        # Build a new dict rather than renaming keys in place: mutating a
        # dict while walking its keys is unsafe and can clobber entries.
        prefixed_payload = {
            destination_name + key: value
            for key, value in commute_payload.items()
        }

        coll.update_one({"_id": m_id['_id']}, {"$set": prefixed_payload})

    return None
Exemplo n.º 8
0
    else:
        return result


# Script entry point: read settings from config.conf, connect to MongoDB,
# start the bot's message loop and then idle.
# The names bound here are module-level globals.
# NOTE(review): presumably `handle` and its helpers read them - confirm.
if __name__ == '__main__':
    # Add this script's directory to the module search path.
    sys.path.append(os.path.dirname(__file__))

    # Telegram settings.
    KEY = config.conf['telegram']['token']
    MY_ID = config.conf['telegram']['my_id']
    max_message_size = config.conf['telegram']['max_message_size']
    # mongo
    host = config.conf['mongo']['host']
    port = int(config.conf['mongo']['port'])
    db = config.conf['mongo']['db']
    collection = config.conf['mongo']['collection']
    # From here on `db` is whatever mongo_connect returns and `collection`
    # is the item looked up on it, no longer the configured names.
    db = mongo.mongo_connect(host, port, db)
    collection = db[collection]
    # lepra
    client_id = config.conf['lepra']['client_id']

    # Two groups of message content type names.
    # NOTE(review): presumably used to tell text-like messages from file
    # attachments - confirm in the handler.
    text_documents = ['text', 'contact', 'location', 'venue']
    file_documents = ['photo', 'document', 'audio', 'video', 'voice']

    bot = create_bot(KEY)
    bot.message_loop(handle)
    # print 'Listening ...'

    # Keep the program running.
    # NOTE(review): assumes message_loop returns immediately and handles
    # messages in the background - confirm.
    while 1:
        time.sleep(10)
Exemplo n.º 9
0
    logging.debug(ex)


def main(disk_bot):
    """Register the chat handler on *disk_bot* and keep the process alive.

    ``on_chat_message`` is registered for ``'chat'`` events, a log line is
    written, and the function then sleeps in 10-second steps forever; it
    never returns.
    """
    handlers = {'chat': on_chat_message}
    disk_bot.message_loop(handlers)
    logging.info('Listening ...')

    # Nothing left to do here; just keep the process from exiting.
    while True:
        time.sleep(10)


# Module-level setup. Everything down to the __main__ guard runs whenever
# this module is loaded, not only when it is executed as a script.
token = config.token

disk_bot = create_bot(token)

# MongoDB connection settings taken from the config module.
host = config.host
port = int(config.port)
db = config.db
collection = config.collection

# `db` and `collection` are rebound here: from the configured names to the
# objects returned by mongo_connect.
client, db, collection = mongo.mongo_connect(host, port, db, collection)
# Two groups of message content type names.
# NOTE(review): presumably used to tell text-like messages from file
# attachments - confirm in the handler.
text_documents = ['text', 'contact', 'location', 'venue']
file_documents = ['photo', 'document', 'audio', 'video', 'voice']
# Logging is configured only at this point, after the bot and the Mongo
# connection have already been created. The log file is opened by this call.
logging.basicConfig(
    format=u'[%(asctime)s] %(filename)s[LINE:%(lineno)d] %(message)s',
    level=logging.DEBUG,
    filename=u'/var/log/YaDisk/t_bot.log')

# Start the bot loop only when the file is run directly.
if __name__ == '__main__':
    main(disk_bot)
Exemplo n.º 10
0
    post = re.sub(r'" alt=.*height="[0-9]+', ' ', post)
    post = re.sub(r' <div class="b-external_image.*media.giphy.com', ' ', post)
    post = re.sub(r'" border="[0-9]', ' ', post)
    post = re.sub(r'" width=.*px', ' ', post)
    post = re.sub(r'" data-start_time="[0-9]+', ' ', post)
    post = re.sub(r'title="', ' ', post)
    post = re.sub(r'" rel="coub".*data-start_time="[0-9]+', ' ', post)
    post = re.sub(r' rel="coub".*data-preview_height="[0-9]+', ' ', post)
    return post


sites = config.conf['lepra']['sites']
db_name = config.conf['lepra']['db_name']

for site in sites:
    client, db, collection = mongo.mongo_connect(db_name, site)

    html = get_posts(site)
    if not html:
        continue
    soup = BeautifulSoup(html)
    results = soup.findAll("div", {"class": re.compile("^(post.*)$")})
    for post_html in results:
        try:
            post = post_html.find('div', {'class': 'dti p_body'})
            post_num = re.match(r'<div class="post.*"? id="p([0-9]+)" data', str(post_html))
            post_num = post_num.group(1)
            url = "https://{}.leprosorium.ru/comments/{}/".format(site, post_num)
            # url = post_html.find('span', {'class': 'b-post_comments_links'}).a.get('href')

            post = str(post)