def test_caching(self):
    # The first call populates the cache file; the second should hit it and
    # return the same results.
    self.assertEqual(
        self.resolved_urls,
        urlexpander.expand(self.urls, cache_file='__cache.json'))
    self.assertEqual(
        self.resolved_urls,
        urlexpander.expand(self.urls, cache_file='__cache.json'))
    os.remove('__cache.json')
def link_report(df):
    # Collect every tweet's text into one string and pull out the URLs.
    tweet_df = df['Tweet']
    all_tweet_links = ""
    for tweet in tweet_df:
        all_tweet_links += tweet + " "
    all_urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        all_tweet_links)
    print("*" * 50)
    print("Total number of links : " + str(len(all_urls)))
    print("*" * 50)
    links = {'Links': all_urls}
    link_df = pd.DataFrame(links, columns=['Links'])
    # Expand each link and count how often each domain appears.
    domain_count = {}
    for link in link_df['Links']:
        url = uex.expand(link)
        domain = uex.get_domain(url)
        if domain in domain_count:
            domain_count[domain] += 1
        else:
            domain_count[domain] = 1
    domain_df = pd.DataFrame.from_dict(domain_count, orient='index',
                                       columns=['Count'])
    print(domain_df.sort_values(by=['Count'], ascending=False).to_string())
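# Hypothetical usage of link_report above. The DataFrame contents and the
# short links are placeholders, and "uex" is assumed to be urlexpander
# imported under that alias.
import re

import pandas as pd
import urlexpander as uex

sample_df = pd.DataFrame({
    'Tweet': [
        "check this out https://bit.ly/3xxxxxx",                     # placeholder
        "two links https://t.co/aaaaaaa and https://t.co/bbbbbbb",   # placeholders
    ]
})
link_report(sample_df)  # prints the link count and a per-domain frequency table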
def create_reddit_actions(es, lines_json, tmp_filename,
                          calc_embeddings=False, expand_urls=False):
    urls_dict = {}
    all_urls = []
    actions = []
    for post in lines_json:
        try:
            urls = extractor.find_urls(str(post['body']))
        except AttributeError:
            post['smapp_urls'] = []
            continue
        post['smapp_urls'] = urls
        all_urls.extend(urls)
    all_urls = [url for url in all_urls if 'reddit.com' not in url]
    if expand_urls:
        expanded_urls = urlexpander.expand(all_urls,
                                           chunksize=1280,
                                           n_workers=64,
                                           cache_file=tmp_filename)
        urls_dict = dict(zip(all_urls, expanded_urls))
    for post_num, post in enumerate(lines_json):
        post = preprocess_reddit_post(post, calc_embeddings, urls_dict)
        period = str(pd.to_datetime(post['created_utc'], unit='s').to_period('M'))
        index_name = f'smapp_reddit_{period}'
        action = {
            "_index": index_name,
            "_type": '_doc',
            "_id": str(post['id']),
            "_source": post,
        }
        actions.append(action)
    return actions
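# create_reddit_actions above assumes a module-level `extractor` exposing
# find_urls() and a `preprocess_reddit_post` helper, neither of which is shown
# in this snippet. A minimal sketch of those assumptions (urlextract is a guess
# at the original dependency; the preprocessing stub is purely hypothetical):
from urlextract import URLExtract

extractor = URLExtract()

def preprocess_reddit_post(post, calc_embeddings, urls_dict):
    # Hypothetical stand-in: substitute expanded URLs back into the post.
    post['smapp_urls'] = [urls_dict.get(u, u) for u in post.get('smapp_urls', [])]
    return post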
def _parse_signup_url(self, url: str) -> Tuple[ParseResult, dict]:
    # Expand the URL unless we already have it cached.
    expanded_url = self._url_cache.get(url, urlexpander.expand(url))
    if "list-manage.com/subscribe" not in expanded_url:
        raise ValueError(
            "It doesn't look like you gave us a MailChimp signup form URL")
    ps = urlparse(expanded_url)
    ps = ps._replace(path=f"{ps.path}/post")
    qs = query_string.parse(ps.query)
    return ps, qs
def yesorno(request):
    thetesturl = request.GET.get("url")
    if not validators.url(thetesturl):
        response = {"status": False, "data": {"error": "not a url"}}
        return JsonResponse(response, json_dumps_params={'indent': 2}, status=200)
    if "latLmes" in thetesturl:
        response = {
            "status": True,
            "data": {
                "url": "latLmes.com",
                "rickroll": True
            }
        }
        return JsonResponse(response, json_dumps_params={'indent': 2})
    # Expand the URL and strip "www." before checking it against known hosts.
    testurl = urlexpander.expand(thetesturl).replace("www.", "")
    query = urlparse(testurl)
    path = query.path
    if query.hostname == "rickroll-links-database.ch1ck3n.repl.co":
        response = {
            "status": True,
            "data": {
                "url": "repl.co",
                "rickroll": False
            }
        }
        return JsonResponse(response, json_dumps_params={'indent': 2})
    if query.hostname == "theraleighregister.com":
        response = {
            "status": True,
            "data": {
                "url": "theraleighregister.com",
                "rickroll": True
            }
        }
        return JsonResponse(response, json_dumps_params={'indent': 2})
    if path == "/":
        path = ""
    if query.query:
        path = path + "?" + query.query
    hostname = query.hostname or ""
    response = {
        "status": True,
        "data": {
            "url": hostname + path,
            "rickroll": (hostname + path) in links
        }
    }
    return JsonResponse(response, json_dumps_params={'indent': 2})
def unpack_google_url():
    try:
        short_url = request.args.get('short_url')
        response_url = urlexpander.expand(short_url)
        # Pull the "@lat,lng" pair out of the expanded Google Maps URL.
        pattern = r'@(\-?\d+\.\d+),(\-?\d+\.\d+)'
        res = re.search(pattern, response_url)
        if res is not None:
            result = {'lat': res.group(1), 'lng': res.group(2), 'status': 'OK'}
        else:
            result = {'status': 'unable to parse', 'url': response_url}
        return json.dumps(result)
    except Exception as e:
        result = {'status': 'error', 'description': str(e)}
        return json.dumps(result)
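# Quick illustration of the coordinate pattern used above, applied to an
# already-expanded Google Maps URL (the URL itself is a made-up example):
import re

sample = "https://www.google.com/maps/place/Somewhere/@40.712800,-74.006000,17z"
match = re.search(r'@(\-?\d+\.\d+),(\-?\d+\.\d+)', sample)
if match:
    print(match.group(1), match.group(2))  # 40.712800 -74.006000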
def unshorturls(proxy: Cut, batch=False):
    """Resolves shortened urls.

    Arguments:
        proxy {Cut} -- Twitter status object (dict) under scalpl access

    Keyword Arguments:
        batch {bool} -- if True, skip inline expansion so the URLs can be
            resolved later in a batch (default: {False})

    Returns:
        [list] -- List of tuples with (shortened url, unshortened url, domain)
    """
    KEY_URLS = ['urls', 'media', 'quoted_status.media', 'quoted_status.urls',
                'retweeted_status.media', 'retweeted_status.urls']
    shortened_urls = []
    for k in KEY_URLS:
        if k in proxy:
            for i in range(len(proxy[k])):
                cur_k_urls_expanded = k + "[%d].expanded_url" % i
                if cur_k_urls_expanded in proxy:
                    c_url_expanded = proxy[cur_k_urls_expanded]
                    not_resolved = False
                    # if urlexpander.is_short(c_url_expanded) or "lajunta.es" in c_url_expanded:
                    if not batch:
                        try:
                            proxy[cur_k_urls_expanded] = urlexpander.expand(
                                c_url_expanded, filter_function=__custom_filter)
                        except Exception:
                            not_resolved = True
                            logger.warning("Could not expand: %s", proxy[cur_k_urls_expanded])
                    if not batch and not not_resolved:
                        # Store the domain of the expanded URL under a sibling key.
                        parsed_uri = urlparse(proxy[cur_k_urls_expanded])
                        domain = parsed_uri.netloc
                        cur_k_urls_expanded_domain = cur_k_urls_expanded.replace(
                            'expanded_url', 'expanded_domain')
                        proxy[cur_k_urls_expanded_domain] = domain
                        shortened_urls += [(c_url_expanded, proxy[cur_k_urls_expanded], domain)]
                    else:
                        shortened_urls += [(c_url_expanded, None, None)]
    return shortened_urls
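# unshorturls above relies on a module-level logger and a `__custom_filter`
# callable passed to urlexpander.expand(); neither is shown in this snippet.
# The stand-ins below (a pass-through filter and a default logger) plus the
# sample status are assumptions so the function can be exercised end to end.
import logging

from scalpl import Cut

logger = logging.getLogger(__name__)

def __custom_filter(link):
    # Guessed contract: the original presumably filtered certain domains or
    # unresolved results; returning True keeps every expansion.
    return True

status = Cut({"urls": [{"expanded_url": "https://bit.ly/3xxxxxx"}]})  # placeholder link
print(unshorturls(status, batch=False))  # [(short url, expanded url, domain)]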
def test_expand_many(self):
    self.assertEqual(self.resolved_urls, urlexpander.expand(self.urls))
def test_expand_one(self):
    self.assertEqual(self.resolved_urls[0], urlexpander.expand(self.urls[0]))
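# The test methods above (test_caching, test_expand_many, test_expand_one)
# rely on fixtures defined elsewhere in the TestCase. A minimal sketch of the
# assumed setUp; the short links are placeholders, not the project's real
# test data.
import os
import unittest

import urlexpander

class UrlExpanderTests(unittest.TestCase):
    def setUp(self):
        self.urls = [
            'https://bit.ly/3xxxxxx',   # placeholder shortened link
            'https://trib.al/yyyyyyy',  # placeholder shortened link
        ]
        # Real tests would hard-code the expected expansions; resolving them
        # once here just keeps the sketch self-contained.
        self.resolved_urls = urlexpander.expand(self.urls)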
            break
        for tweet in new_tweets:
            tweet_list.append(tweet.full_text)
            print(tweet.url)
        tweetCount += len(new_tweets)
        print("Found {0} tweets".format(tweetCount))
        max_id = new_tweets[-1].id
    except tweepy.TweepError as e:
        # Just exit on any error
        print("some error : " + str(e))
        break

print("Downloaded {0} tweets.".format(tweetCount))

for tweet in tweet_list:
    songUrls = re.findall(urlmarker.URL_REGEX, tweet)
    for songUrl in songUrls:
        print(songUrl)
        expanded = urlexpander.expand(songUrl)
        print(expanded)
        if "spotify" in songUrl:
            songUri = getUri(songUrl)
            if songUri is not None:
                # Add the track to the playlist via the Spotify Web API.
                requests.post(
                    "https://api.spotify.com/v1/playlists/34PelCJCvwUvOQIbhoN0he/tracks?uris=" + songUri,
                    headers={
                        "Accept": "application/json",
                        "Content-Type": "application/json",
                        "Authorization": "Bearer " + spotifyAuth,
                    })
                print("added a song!")
            else:
                print("not a track, can't add!")
def createNotionTask(token, collectionURL, content, url):

    def convertImagePath(imagePath, mdFilePath):
        # Download the image locally and return a path Notion can upload from.
        parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.scheme + '://' + parsed_url.netloc
        relative_url = os.path.abspath(
            str(pathlib.Path().absolute()) + '/images/' + imagePath)
        new_url = urllib.parse.urljoin(domain, imagePath)
        r = http.request('GET', new_url)
        img = r.data
        os.makedirs(os.path.dirname(relative_url), exist_ok=True)
        with open(relative_url, 'wb') as f:
            f.write(img)
        return Path(os.path.abspath(str(pathlib.Path().absolute()) + imagePath))

    if content:
        client = NotionClient(token)
        cv = client.get_collection_view(collectionURL)
        print(cv.collection.parent.views)
        row = cv.collection.add_row()
        if 'task:' in content:
            content = content.replace('task:', '')
        if 'Task:' in content:
            content = content.replace('Task:', '')
        row.title = content
        if url and "http://ifttt.com/missing_link" not in url:
            expanded_url = urlexpander.expand(url)
            if 'imgur' in expanded_url:
                if 'gallery/' in expanded_url:
                    # Rebuild the imgur gallery as inline images and convert it
                    # to markdown blocks for Notion.
                    gallery = expanded_url.split('gallery/')[1]
                    client = ImgurClient(client_id, client_secret)
                    items = client.get_album_images(gallery)
                    imgur_object = ""
                    for item in items:
                        img = "<img src='" + item.link + "' /><br>"
                        imgur_object += img
                    text = prettierfier.prettify_html(imgur_object)
                    doc = Document(text)
                    text = doc.summary()
                    output = pypandoc.convert_text(text, 'gfm-raw_html', format='html')
                    if output == "":
                        page = row.children.add_new(BookmarkBlock)
                        page.link = url
                        page.title = content
                    else:
                        rendered = convert(output)
                        for blockDescriptor in rendered:
                            uploadBlock(blockDescriptor, row, content,
                                        imagePathFunc=convertImagePath)
            else:
                # Fetching and converting the page body is disabled in this
                # snippet; fall back to a plain bookmark block.
                page = row.children.add_new(BookmarkBlock)
                page.link = url
                page.title = content
        else:
            row.children.add_new(TextBlock, title=content)
    # shutil.rmtree(Path(str(pathlib.Path().absolute()) + '/images/'), ignore_errors=True)
    return content
def expand_urls(config):
    short_link_services = [
        'bit.ly', 'dlvr.it', 'liicr.nl', 'tinyurl.com', 'goo.gl', 'ift.tt',
        'ow.ly', 'fxn.ws', 'buff.ly', 'back.ly', 'amzn.to', 'nyti.ms',
        'nyp.st', 'dailysign.al', 'j.mp', 'wapo.st', 'reut.rs', 'drudge.tw',
        'shar.es', 'sumo.ly', 'rebrand.ly', 'covfefe.bz', 'trib.al',
        'yhoo.it', 't.co', 'shr.lc', 'po.st', 'dld.bz', 'bitly.com',
        'crfrm.us', 'flip.it', 'mf.tt', 'wp.me', 'voat.co', 'zurl.co',
        'fw.to', 'mol.im', 'read.bi', 'disq.us', 'tmsnrt.rs', 'usat.ly',
        'aje.io', 'sc.mp', 'gop.cm', 'crwd.fr', 'zpr.io', 'scq.io',
        'trib.in', 'owl.li', 'youtu.be',
    ]
    urls_table = pd.read_csv(
        os.path.join(config["PATHS"]["INTERMEDIATE_DATA_DIR"], "tweet_url_table.csv"),
        usecols=["tweet_id", "url"])
    # Keep only URLs whose domain is a known shortener.
    urls_tweet_id = dict()
    for ix, row in urls_table.iterrows():
        url = row["url"]
        tweet_id = row["tweet_id"]
        domain = extract_top_domain(url)
        if domain in short_link_services:
            urls_tweet_id[url] = tweet_id
    print("No. urls to expand: " + str(len(urls_tweet_id)))

    q = queue.Queue()

    def expand_domain(short_url):
        expanded_url = infer_base_url(short_url)
        top_domain = extract_top_domain(expanded_url)
        q.put([short_url, expanded_url, top_domain])
        print("Working on {}, {}".format(short_url, len(q.queue)))

    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
        executor.map(expand_domain, list(urls_tweet_id.keys()))

    res_df = pd.DataFrame(list(q.queue),
                          columns=['short_url', 'expanded_url', 'top_domain'])
    print("Updating links")
    expanded_urls_dict = dict()
    for ix, row in res_df.iterrows():
        old = row["short_url"]
        new = row["expanded_url"]
        if old == new:
            # It wasn't expanded: try again with urlexpander.
            try:
                new_v2 = urlexpander.expand(old)
                if new_v2:
                    new = new_v2
            except Exception:
                pass
        expanded_urls_dict[old] = new
    pkl.dump(
        expanded_urls_dict,
        open(os.path.join(config["PATHS"]["INTERMEDIATE_DATA_DIR"],
                          "urls_expanded.pkl"), "wb"))
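# Hypothetical invocation of expand_urls above. The directory path is an
# assumption; the config keys mirror what the function actually reads.
if __name__ == "__main__":
    config = {"PATHS": {"INTERMEDIATE_DATA_DIR": "./intermediate_data"}}
    # Reads tweet_url_table.csv from that directory and writes urls_expanded.pkl.
    expand_urls(config)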