def parse_post(post):
    # Extract text and strip html tags and links
    content = strip_html_tags(post['the_post']['rawContent'])
    links = extract_urls(content)
    for url in links:
        content = content.replace(url, '')
    try:
        images = list(map(lambda x: x['cdnUrl'], post['the_post']['entities']['images']))
    except KeyError:
        images = []
    return content, links, images
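# Minimal usage sketch for parse_post, assuming strip_html_tags and extract_urls
# behave roughly like the naive stand-ins below; the helper bodies and the post
# structure are illustrative, not the project's real implementations.
import re

def strip_html_tags(text):
    # Naive stand-in: drop anything that looks like an HTML tag.
    return re.sub(r'<[^>]+>', '', text)

def extract_urls(text):
    # Naive stand-in: match http(s) URLs.
    return re.findall(r'https?://\S+', text)

sample_post = {
    'the_post': {
        'rawContent': '<p>See https://example.com for details</p>',
        'entities': {'images': [{'cdnUrl': 'https://cdn.example.com/a.png'}]},
    }
}
content, links, images = parse_post(sample_post)
print(content)  # 'See  for details'
print(links)    # ['https://example.com']
print(images)   # ['https://cdn.example.com/a.png']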
def get_urls():
    start = time.perf_counter()
    pokedex = utils.get_data("pokedex.json")
    urls = []
    success = 0
    fail = 0
    alola = 0
    for i in range(1, len(pokedex) + 1):
        pokemon = pokedex[str(i)]
        print_prefix = f"{str(i).zfill(3)} {pokemon}"
        pokemon = quote(pokemon)
        if i == 29:
            pokemon += "♀"
        elif i == 32:
            pokemon += "♂"
        api_url = f"https://bulbapedia.bulbagarden.net/w/api.php?action=parse&format=json&page={pokemon}_(Pokémon)"
        r = requests.get(api_url)
        if r.status_code == 200:
            lst = utils.extract_urls(r.text)
            if lst:
                url = lst[2]
                image_url = "https://" + url[:url.rfind("/")].replace("/thumb/", "/")
                urls.append(image_url)
                print(f"{print_prefix:<20} > {image_url}")
                success += 1
                for url in lst:
                    if pokemon.replace(" ", "_") + "-Alola" in url:
                        image_url = "https://" + url[:url.rfind("/")].replace("/thumb/", "/")
                        urls.append(image_url)
                        print_prefix += "-alola"
                        print(f"{print_prefix:<20} > {image_url}")
                        alola += 1
                        break
            else:
                print(f"{print_prefix:<20} > Could not find a URL")
                fail += 1
        else:
            print(f"{print_prefix:<20} > {r.status_code} {r.reason}")
            fail += 1
    elapsed_time = time.perf_counter() - start
    print(
        f"\nFound {success + alola}/{success + fail + alola} ({success} + {alola}) URLs in {elapsed_time:0.02f} seconds.\n"
    )
    return urls
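# get_urls above relies on module-level imports and a project-local utils module
# providing get_data("pokedex.json") -> {"1": "Bulbasaur", ...} and
# extract_urls(html) -> list of image paths; those interfaces are inferred from
# the calls, not confirmed. One possible way to drive it:
import time
import requests
from urllib.parse import quote
import utils  # assumed local helper module

if __name__ == "__main__":
    urls = get_urls()
    with open("urls.txt", "w") as f:
        f.write("\n".join(urls))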
import argparse
import os.path as osp
import shutil

import git

parser = argparse.ArgumentParser()
parser.add_argument("--url", type=str,
                    help="The link to the repo that you want to check")
parser.add_argument("--show-invalid-only", action="store_true",
                    help="If set, only invalid urls are printed.")
args = parser.parse_args()

url = args.url
if url.endswith(".git"):
    url = url[:-4]
giturl = osp.join(url, "blob/master")

if osp.exists("temp/"):
    shutil.rmtree("temp/")
git.Repo.clone_from(url, "temp")

from utils import extract_urls, test_url_availability

for url, fname, lidx in extract_urls(folder="temp/"):
    available = test_url_availability(url)
    if available and args.show_invalid_only:
        continue
    status = "valid" if available else "invalid"
    print("[%s]" % status, url)
    rel_path = "/".join(fname.split("/")[1:])
    print("\t", "It is in %s#L%d" % (osp.join(giturl, rel_path), lidx + 1))

if osp.exists("temp/"):
    shutil.rmtree("temp/")
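# Example invocation, assuming the script above is saved as check_repo_urls.py
# (the filename is illustrative):
#
#   python check_repo_urls.py --url https://github.com/user/repo.git --show-invalid-only
#
# Without --show-invalid-only, every discovered URL is printed with its
# valid/invalid status and a blob/master link to the line it came from.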
def prominent_domains(js, keyw, domains=set(), extend_search=True):
    """
    Given keywords of a site and domains returned by a google search,
    check which domains are found on the site. The result is split into
    primary and secondary targets. A primary target is a domain that is
    found in the keywords or among the mld guesses made from the site.
    A secondary target is a domain that is not found in the keywords but
    appears elsewhere on the site.

    Parameters
    ----------
    js : json object
        contains site data
    keyw : list
        contains site keywords
    domains : set
        set of tuples (mld, ps)
    extend_search : boolean
        whether to look for prominent domains from text and links as well

    Returns
    -------
    prominent : set
        set of strings "mld.ps" that either appear in the keywords or can
        be guessed from them
    """
    prominent = set()
    mld_guesses = keywords.guess_mld(js)
    url_tokens = re.split(r'\W+', (js['starturl'] + ' ' + js['landurl']).lower())
    title_tokens = re.split(r'\W+', js['title'].lower())
    # logger.print("checking for prominent domains:")
    for mld, ps in domains:
        mld = mld.lower()
        ps = ps.lower()
        # segments = ngrams.segment(mld)
        if mld in keyw:
            logger.print("mld found from keywords: {}.{}".format(mld, ps), nots=True)
            prominent.add('.'.join([mld, ps]))
            # prominent.add((mld, ps))
        elif mld in mld_guesses:
            logger.print("mld found from mld-guessing: {}.{}".format(mld, ps), nots=True)
            prominent.add('.'.join([mld, ps]))
            # prominent.add((mld, ps))
        # elif extend_search and ' '.join(segments) in ' '.join(js['text'].lower().split()) and mld not in STOPMLDS:
        #     logger.print("found by segmentation from text: {}.{}".format(mld, ps), nots=True)
        #     prominent.add('.'.join([mld, ps]))
        # elif all(item in title_tokens for item in segments):
        #     logger.print("found by segmentation from title: {}.{}".format(mld, ps), nots=True)
        #     prominent.add('.'.join([mld, ps]))
        #     # prominent.add((mld, ps))
        elif mld in url_tokens:
            logger.print("mld in url: {}.{}".format(mld, ps), nots=True)
            prominent.add('.'.join([mld, ps]))
            # prominent.add((mld, ps))

    if extend_search:
        link_domains = set(
            keywords.split_mld_ps(link) for link in utils.extract_urls(js['source']))
        link_domains |= set(
            keywords.split_mld_ps(link) for link in js['loglinks'])
        # remove mlds that often occur: google, blogger, ... These are STOPMLDS
        link_domains = set(
            (mld, ps) for (mld, ps) in link_domains if mld not in STOPMLDS)
        for dom in domains:
            if dom in link_domains and dom not in prominent:
                logger.print("mld found from links: {}.{}".format(*dom), nots=True)
                prominent.add('.'.join(dom))
                # prominent.add(dom)

    return prominent
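# In the function above, "mld" and "ps" appear to stand for main-level domain and
# public suffix (e.g. "bbc" and "co.uk" for https://forums.bbc.co.uk/topic). A
# minimal sketch of what keywords.split_mld_ps could look like, assuming the
# tldextract package; the project's real helper may differ.
import tldextract

def split_mld_ps(url):
    ext = tldextract.extract(url)
    return ext.domain, ext.suffix

# split_mld_ps("https://forums.bbc.co.uk/topic") -> ("bbc", "co.uk")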
def url_worker(urlinput, urloutput):
    for comment_id, body in iter(urlinput.get, 'STOP'):
        url_set = extract_urls(body)
        urloutput.put((comment_id, url_set))
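# A sketch of how url_worker could be driven with multiprocessing queues and the
# 'STOP' sentinel it expects; extract_urls is assumed to be available in the
# worker's module (a naive regex stand-in is used here for illustration).
import multiprocessing as mp
import re

def extract_urls(text):
    # Stand-in for the project's extract_urls.
    return set(re.findall(r'https?://\S+', text))

if __name__ == "__main__":
    urlinput, urloutput = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=url_worker, args=(urlinput, urloutput)) for _ in range(2)]
    for w in workers:
        w.start()
    comments = {1: "see https://example.com", 2: "no links here"}
    for cid, body in comments.items():
        urlinput.put((cid, body))
    for _ in workers:
        urlinput.put('STOP')  # one sentinel per worker
    for _ in comments:
        print(urloutput.get())
    for w in workers:
        w.join()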
import git, os

from utils import extract_urls, test_url_availability

# os.makedirs("temp", exist_ok=True)
# git.Repo.clone_from("https://github.com/mit-han-lab/proxylessnas.git", "temp")
# git.Repo.clone_from("https://github.com/mit-han-lab/AMC.git", "temp")

giturl = "https://github.com/mit-han-lab/AMC/blob/master/"
for url, fname, lidx in extract_urls(folder="."):
    print(url)
    print("\t", test_url_availability(url))
    print("\t", "%s%s#L%d" % (giturl, fname.replace("temp/", ""), lidx + 1))
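# One way test_url_availability could be implemented, assuming it only needs a
# boolean reachability check via requests; the real helper in utils may differ.
import requests

def test_url_availability(url, timeout=5):
    try:
        r = requests.head(url, allow_redirects=True, timeout=timeout)
        # Fall back to GET for servers that reject HEAD.
        if r.status_code >= 400:
            r = requests.get(url, stream=True, timeout=timeout)
        return r.status_code < 400
    except requests.RequestException:
        return False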