def consolidate(infile="data/all_raw_cleaned.csv", outfile="data/all_metadata.csv"):
    good = read_meta()
    total = 0
    with open(infile, "r", errors="ignore") as inf, open(outfile, 'w') as outf:
        reader = csv.reader(inf, delimiter=",")
        w = csv.writer(outf, delimiter=',', quotechar='"',
                       quoting=csv.QUOTE_MINIMAL)
        for line in reader:
            total += 1
            # pad short rows out to 16 columns
            while len(line) < 16:
                line.append("")
            url = line[1].replace("www.", "")
            if url in good:
                # add title if not there
                if not is_bad(good[url][1]) and is_bad(line[2]):
                    line[2] = good[url][1]
                # add description
                if not is_bad(good[url][2]):
                    line[14] = good[url][2]
                # add locale
                if not is_bad(good[url][3]):
                    line[15] = good[url][3]
            w.writerow(line)
            print(str(total), end="\r")
def locale_to_country(infile="data/all_metadata.csv", outfile="data/all_metadata2.csv"):
    total = 0
    with open(infile, "r", errors="ignore") as inf, open(outfile, 'w') as outf:
        reader = csv.reader(inf, delimiter=",")
        w = csv.writer(outf, delimiter=',', quotechar='"',
                       quoting=csv.QUOTE_MINIMAL)
        for line in reader:
            total += 1
            if not is_bad(line[15]):
                # split the locale column into country and language
                country, language = parse_locale(line[15])
                if language != line[3] and not is_bad(language):
                    line[3] = language
                if country != line[0] and not is_bad(country):
                    line[0] = country
            w.writerow(line)
            print(str(total), end="\r")
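# A minimal usage sketch (an assumption, not part of the original module): the
# default filenames suggest consolidate() feeds locale_to_country(), so the two
# can be chained. run_metadata_pipeline is a hypothetical wrapper added only for
# illustration; it assumes csv is imported and read_meta(), is_bad(), and
# parse_locale() are defined alongside the functions above.
def run_metadata_pipeline():
    consolidate()          # data/all_raw_cleaned.csv -> data/all_metadata.csv
    locale_to_country()    # data/all_metadata.csv -> data/all_metadata2.csv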
def get_graph_spec(source):
    q = ''
    if helpers.is_bad(source[1]):
        print(source[1])
        return q
    if source[1].find('.') == -1:
        return q
    url = '<http://' + urllib.parse.quote(source[1]) + '>'
    url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
    graph = """ GRAPH """ + url

    # url
    match = "{" + graph + "{ ?item wdt:P1896 ?url}}"
    q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
          " wdt:P1896 " + url + """ }} WHERE """ + match + ";")

    # country
    if not helpers.is_bad(source[0]):
        country_code = get_country_code(source[0])
        if not helpers.is_bad(country_code):
            c = country_code
        else:
            c = helpers.clean(source[0])
        match = "{" + graph + "{ ?item wdt:P17 ?country}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P17 " + c + """ }} WHERE """ + match + ";")
    return q
def get_sources():
    sources = {}
    pref = '/Users/lavanyasingh/Desktop/GSC2O19internet_archive/data/cleaned'
    paths = os.listdir(pref)
    paths.remove('.DS_Store')
    for path in paths:
        total = 0
        with open('data/cleaned/' + path, 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            next(reader)
            for item in reader:
                # skip rows whose URL field is really a query string
                if item[1].find('subject=') != -1:
                    continue
                total += 1
                url = helpers.truncate(item[1])
                item[1] = url
                val = check_sources(list(sources.keys()), url, path)
                if val != -1:
                    # merge into the existing entry, keeping the shorter URL
                    for i in range(len(item)):
                        if not helpers.is_bad(item[i]):
                            sources[val][i] = item[i]
                    if len(url) < len(val):
                        sources[val][1] = url
                else:
                    item += ['' for i in range(10)]
                    # normalize various spellings of the USA
                    if (item[0] == "United States" or item[0].lower() == "us"
                            or item[0].lower() == "usa"):
                        item[0] = "United States of America"
                    if path != 'sheet_cleaned.csv':
                        item[7] = get_meta(path)
                    sources.update({url: item})
                if total % 2500 == 0:
                    print(path, total)
        print(path, total)
    return sources
def get_coords(sources):
    data = {}
    coord = ""
    for source in sources:
        if not helpers.is_bad(source[9]):
            # city known: look up with country if we have one
            if not helpers.is_bad(source[0]):
                coord = get_country_city_coord(source[0], source[9])
            else:
                coord = get_city_coord(source[9])
        elif (not helpers.is_bad(source[8]) and source[0] in
              ["united states", "unitedstatesofamerica", "us", "usa"]):
            # no city, but a US state is given
            coord = get_state_coord_us(source[8])
        elif not helpers.is_bad(source[0]):
            coord = get_country_coord(source[0])
        if coord != "":
            data.update({source[1]: coord})
    return data
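# Usage sketch (hedged): get_sources() above returns a dict keyed by URL, while
# get_coords() indexes each source as a row, so the dict's values are what it
# appears to expect. build_coord_table is a hypothetical helper, not in the
# original code, and assumes both functions live in the same module.
def build_coord_table():
    sources = get_sources()
    # map each source URL to a coordinate string
    return get_coords(list(sources.values()))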
def make_cleaned():
    cleaned = []
    for url in sources:
        line = sources[url]
        if url in good:
            line.append(codes[url][1])
            line.append(meta[url][1])
            # prefer the scraped title unless the row came from an original source
            if helpers.is_bad(line[2]) or line[7].find("original") == -1:
                line[2] = meta[url][0]
        cleaned.append(line)
    return cleaned
def make_all_data(self, infile):
    rows = {}
    with open('data/' + infile, 'r') as inf:
        reader = csv.reader(inf, delimiter=',')
        for line in reader:
            # clean URL
            url = helpers.truncate(line[1])
            line[1] = url
            if len(rows) % 5000 == 0:
                print(url)
            # get path ('/money' or '/index' for example)
            o = urllib.parse.urlparse(self.prep_url(line[1]))
            path = o.path
            # add metasource
            metasource = self.clean_meta(line[7])
            line[7] = [metasource]
            # check for various spellings of USA
            if (line[0] == "United States" or line[0].lower() == "us"
                    or line[0].lower() == "usa"):
                line[0] = "United States of America"
            # extend row
            line += ['', '']
            # if unique url, add path
            if url not in rows:
                if self.url_is_good(url):
                    if self.path_is_good(path):
                        line[13] = [path]
                    rows.update({url: line})
            else:
                # add metasource if necessary
                if metasource not in rows[url][7]:
                    rows[url][7].append(metasource)
                # update any broken metadata to new value
                for i in range(len(rows[url]) - 1):
                    if helpers.is_bad(rows[url][i]):
                        try:
                            rows[url][i] = line[i]
                        except IndexError:
                            pass
                # add path if good
                if self.path_is_good(path):
                    rows[url][13].append(path)
    return rows
def process(self):
    total = 0
    size = 1000000
    random.seed()
    countries = []
    with open(self.infile, 'r') as inf, open(self.outfile, 'w') as outf:
        reader = csv.reader(inf, delimiter=',')
        w = csv.writer(outf, delimiter=',', quotechar='"',
                       quoting=csv.QUOTE_MINIMAL)
        w.writerow(["Country", "URL", "Title", "Size"])
        for line in reader:
            total += 1
            print(str(total), end="\r")
            # skip entries with broken titles
            if helpers.is_bad(line[2]):
                continue
            # skip countries we've already seen and bad countries
            if line[0] in countries or helpers.is_bad(line[0]):
                continue
            # shrink the size for each successive row, boosting the first few
            size = size / 1.05
            if total < 5:
                size = size * 5
            # skip rows whose "title" is really a numeric Q-id
            try:
                int(line[2].replace("Q", "")[0:3])
                continue
            except ValueError:
                countries.append(line[0])
            row = [line[0], line[1], line[2], size]
            w.writerow(row)
def get_graph_spec(info):
    url_raw, metasource = info[0], info[1]
    q = ''
    if helpers.is_bad(url_raw):
        print(url_raw)
        return q
    if url_raw.find('.') == -1:
        return q
    url = '<http://' + urllib.parse.quote(url_raw) + '>'
    url_item = '<http://' + urllib.parse.quote(url_raw) + '/item>'
    graph = """ GRAPH """ + url
    ms = helpers.strip_spaces(metasource)
    # insert the metasource only if the graph exists and does not
    # already carry this metasource
    q = ("INSERT {" + graph + "{" + url_item + " wnp:metasource wni:" + ms +
         """}} WHERE {FILTER (EXISTS {""" + graph +
         """{?s ?p ?o} } && NOT EXISTS {""" + graph +
         "{ ?item wnp:metasource wni:" + ms + "}})};")
    return q
def make_all_data():
    total, uq = 0, 0
    rows = {}
    for path in paths:
        print(path)
        with open('data/raw/' + path, 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            for line in reader:
                total += 1
                url = clean_url(line[1])
                line[1] = url
                metasource = clean_meta(line[7])
                row = line
                # normalize various spellings of the USA
                if (row[0] == "United States" or row[0].lower() == "us"
                        or row[0].lower() == "usa"):
                    row[0] = "United States of America"
                row[7] = [metasource]
                row = [row[i] for i in range(12)] + ['', '']
                if url not in rows:
                    if url_is_good(url):
                        uq += 1
                        if len(sources[url]) < 10:
                            row[13] = sources[url]
                        else:
                            row[13] = []
                        rows.update({url: row})
                else:
                    # add metasource if necessary
                    if metasource not in rows[url][7]:
                        rows[url][7].append(metasource)
                    # update any broken metadata to the new value
                    for i in range(len(rows[url]) - 1):
                        if helpers.is_bad(rows[url][i]):
                            try:
                                rows[url][i] = row[i]
                            except IndexError:
                                print(row, i)
                                return "OOPS"
                if total % 10000 == 0 and url_is_good(url):
                    print(url, rows[url])
        print("DONE", path, total, uq)
    return rows
def process_geo(self):
    countries = self.read_countries()
    total = 0
    with open(self.infile, 'r') as inf, open(self.outfile, 'w') as outf:
        reader = csv.reader(inf, delimiter=',')
        w = csv.writer(outf, delimiter=',', quotechar='"',
                       quoting=csv.QUOTE_MINIMAL)
        w.writerow(["ISO Code", "Latitude", "Longitude", "Country", "URL", "Title"])
        for line in reader:
            total += 1
            # skip entries with broken country values
            if helpers.is_bad(line[0]):
                continue
            # fuzzy-match the raw country value to a known country
            country = process.extract(line[0], list(countries.keys()), limit=1)[0][0]
            print(str(total), end="\r")
            # ISO code, latitude, longitude, country + URL, title
            row = countries[country] + [line[1], line[2]]
            w.writerow(row)
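# Note on the fuzzy match above: `process` is assumed to be fuzzywuzzy's process
# module (from fuzzywuzzy import process). extract() returns (candidate, score)
# pairs sorted by score, so [0][0] picks the best-matching country name, mapping
# loose spellings onto the canonical country keys. best_country_match is a
# hypothetical standalone sketch of that lookup, not part of the original class.
def best_country_match(raw_name, known_countries):
    from fuzzywuzzy import process as fuzzy_process
    return fuzzy_process.extract(raw_name, known_countries, limit=1)[0][0]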
def get_graph_spec(source):
    q = ''
    if helpers.is_bad(source[1]):
        print(source[1])
        return q
    if source[1].find('.') == -1:
        return q
    url = '<http://' + urllib.parse.quote(source[1]) + '>'
    url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
    graph = """ GRAPH """ + url

    # url
    q += ("INSERT { " + graph + " {" + url_item + " wdt:P1896 \'" +
          urllib.parse.quote(source[1]) +
          """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
          "{ ?item wdt:P1896 ?url}})} ;")

    # country
    if not helpers.is_bad(source[0]):
        country_code = get_country_code(source[0])
        if not helpers.is_bad(country_code):
            c = country_code
        else:
            c = helpers.clean(source[0])
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P17 \'" + c +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P17 ?country}})} ;")

    # title
    if not helpers.is_bad(source[2]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P1448 \'" +
              helpers.clean(source[2]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P1448 ?title}})} ;")

    # language
    if not helpers.is_bad(source[3]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P37 \'" +
              helpers.clean(source[3]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P37 ?lang}})} ;")

    # type
    if not helpers.is_bad(source[4]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P31 \'" +
              helpers.clean(source[4]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P31 ?type}})} ;")

    # title (native language)
    if not helpers.is_bad(source[5]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P1704 \'" +
              helpers.clean(source[5]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P1704 ?title_native}})} ;")

    # paywall
    if not helpers.is_bad(source[6]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:paywalled \'" +
              helpers.clean(source[6]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:paywalled ?pw}})} ;")

    # metasource
    if not helpers.is_bad(source[7]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:metasource wni:" +
              helpers.strip_spaces(source[7]).lower() +
              """ }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:metasource ?ms}})} ;")

    # state
    if not helpers.is_bad(source[8]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P131 \'" +
              helpers.clean(source[8]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P131 ?state}})} ;")

    # wikipedia name
    if not helpers.is_bad(source[10]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:wikipedia-name \'" +
              helpers.clean(source[10]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:wikipedia-name ?wp_name}})} ;")

    # redirects?
    if not helpers.is_bad(source[11]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:redirect \'" +
              helpers.clean(source[11]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:redirect ?rd}})} ;")

    # wikipedia link
    if not helpers.is_bad(source[12]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:wikipedia-page \'" +
              helpers.clean(source[12]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:wikipedia-page ?wp_page}})} ;")

    return q
def overwrite(self, source):
    q = ''
    # check for a valid url
    if helpers.is_bad(source[1]) or source[1].find('.') == -1:
        return q
    # add url to graph
    url = '<http://' + urllib.parse.quote(
        source[1].replace("http://", "").replace("https://", "")) + '>'
    url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
    graph = """ GRAPH """ + url

    # add url
    match = "{" + graph + "{ ?item wdt:P1896 ?url}}"
    q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
          " wdt:P1896 " + url + """ }} WHERE {} ;""")

    # add country
    if not helpers.is_bad(source[0]):
        country_code = self.get_country_code(source[0])
        if not helpers.is_bad(country_code):
            c = country_code
        else:
            c = helpers.clean(source[0])
        match = "{" + graph + "{ ?item wdt:P17 ?country}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P17 " + c + """ }} WHERE {} ;""")

    # add title
    if not helpers.is_bad(source[2]):
        match = "{" + graph + "{ ?item wdt:P1448 ?title}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P1448 \'" + helpers.clean_string(source[2]) +
              """\' }} WHERE {} ;""")

    # add language
    if not helpers.is_bad(source[3]):
        match = "{" + graph + "{ ?item wdt:P37 ?lang}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P37 \'" + helpers.clean_string(source[3]) +
              """\' }} WHERE {} ;""")

    # add type
    if not helpers.is_bad(source[4]):
        match = "{" + graph + "{ ?item wdt:P31 ?type}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P31 \'" + helpers.clean_string(source[4]) +
              """\' }} WHERE {} ;""")

    # add title (native language)
    if not helpers.is_bad(source[5]):
        match = "{" + graph + "{ ?item wdt:P1704 ?title}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P1704 \'" + helpers.clean_string(source[5]) +
              """\'}} WHERE {} ;""")

    # add paywall
    if not helpers.is_bad(source[6]):
        match = "{" + graph + "{ ?item wnp:paywalled ?pw}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wnp:paywalled \'" + helpers.clean_string(source[6]) +
              """\' }} WHERE {} ;""")

    # add metasources
    for ms in source[7]:
        if not helpers.is_bad(ms):
            match = "{" + graph + "{ ?item wnp:metasource ?ms}}"
            q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
                  " wnp:metasource wni:" + helpers.strip_spaces(ms).lower() +
                  """ }} WHERE {} ;""")

    # add state
    if not helpers.is_bad(source[8]):
        match = "{" + graph + "{ ?item wdt:P131 ?state}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P131 \'" + helpers.clean_string(source[8]) +
              """\' }} WHERE {} ;""")

    # add wikipedia name
    if not helpers.is_bad(source[10]):
        match = "{" + graph + "{ ?item wnp:wikipedia-name ?wp_name}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wnp:wikipedia-name \'" + helpers.clean_string(source[10]) +
              """\' }} WHERE {} ;""")

    # add redirects?
    if not helpers.is_bad(source[11]):
        match = "{" + graph + "{ ?item wnp:redirect ?rd}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wnp:redirect \'" + helpers.clean_string(source[11]) +
              """\' }} WHERE {} ;""")

    # add wikipedia link
    if not helpers.is_bad(source[12]):
        match = "{" + graph + "{ ?item wnp:wikipedia-page ?wp_page}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wnp:wikipedia-page \'" + helpers.clean_string(source[12]) +
              """\' }} WHERE {} ;""")

    # add description
    try:
        if not helpers.is_bad(source[14]):
            match = "{" + graph + "{ ?item wnp:description ?desc}}"
            q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
                  " wnp:description \'" + helpers.clean_string(source[14]) +
                  """\' }} WHERE {} ;""")
    except IndexError:
        pass

    return q
def first_load(self, source):
    # checks for bad URLs
    if helpers.is_bad(source[1]) or source[1].find('.') == -1:
        return ''
    # insert URL
    url = '<http://' + urllib.parse.quote(source[1]) + '>'
    url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
    q = """GRAPH """ + url + """ { """ + url_item + """ wdt:P1896 """ + url

    # add country
    if not helpers.is_bad(source[0]):
        country_code = self.get_country_code(source[0])
        if not helpers.is_bad(country_code):
            q += """; wdt:P17 """ + country_code
        else:
            q += """; wdt:P17 \'""" + helpers.clean_string(source[0]) + """\' """

    # add title
    if not helpers.is_bad(source[2]):
        q += """; wdt:P1448 \'""" + helpers.clean_string(source[2]) + """\' """

    # add language
    if not helpers.is_bad(source[3]):
        q += """; wdt:P37 \'""" + helpers.clean_string(source[3]) + """\' """

    # add type
    if not helpers.is_bad(source[4]):
        q += """; wdt:P31 \'""" + helpers.clean_string(source[4]) + """\' """

    # add title (native language)
    if not helpers.is_bad(source[5]):
        q += """; wdt:P1704 \'""" + helpers.clean_string(source[5]) + """\' """

    # add paywall
    if not helpers.is_bad(source[6]):
        q += """; wnp:paywalled \'""" + helpers.clean_string(source[6]) + """\' """

    # add metasources
    if not helpers.is_bad(source[7]):
        q += self.get_ms(source[7])

    # add state
    if not helpers.is_bad(source[8]):
        q += """; wdt:P131 \'""" + helpers.clean_string(source[8]) + """\' """

    # add town
    if not helpers.is_bad(source[9]):
        q += """; wdt:P131 \'""" + helpers.clean_string(source[9]) + """\' """

    # add wikipedia name
    if not helpers.is_bad(source[10]):
        q += """; wnp:wikipedia-name \'""" + helpers.clean_string(source[10]) + "\' "

    # add redirects?
    if not helpers.is_bad(source[11]):
        q += """; wnp:redirect \'""" + helpers.clean_string(source[11]) + """\' """

    # add wikipedia link
    if not helpers.is_bad(source[12]):
        q += """; wnp:wikipedia-page \'""" + urllib.parse.quote(source[12]) + """\'"""

    # add paths
    if not helpers.is_bad(source[13]):
        q += self.get_path_spec(source[13])

    # add description
    if not helpers.is_bad(source[14]):
        q += """; wnp:description \'""" + helpers.clean_string(source[14]) + "\'"

    q += """.}"""
    return q
def no_overwrite(self, source):
    q = ''
    if helpers.is_bad(source[1]):
        print(source[1])
        return q
    # this means our url is not valid
    if source[1].find('.') == -1:
        return q
    # begin constructing graph spec
    # construct item
    item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
    # construct item URL
    url = '<http://' + urllib.parse.quote(source[1]) + '>'
    # construct graph value
    graph = """ GRAPH """ + url

    # add URL
    q += ("INSERT { " + graph + " {" + item + " wdt:P1896 " + url +
          """ }} WHERE {FILTER (NOT EXISTS {""" + graph +
          "{ ?item wdt:P1896 ?url}})} ;")

    # add country
    if not helpers.is_bad(source[0]):
        country_code = self.get_country_code(source[0])
        if not helpers.is_bad(country_code):
            c = country_code
        else:
            c = helpers.clean(source[0])
        q += (" INSERT { " + graph + " {" + item + " wdt:P17 \'" + c +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P17 ?country}})} ;")

    # add title
    if not helpers.is_bad(source[2]):
        q += (" INSERT { " + graph + " {" + item + " wdt:P1448 \'" +
              helpers.clean(source[2]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P1448 ?title}})} ;")

    # add language
    if not helpers.is_bad(source[3]):
        q += (" INSERT { " + graph + " {" + item + " wdt:P37 \'" +
              helpers.clean(source[3]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P37 ?lang}})} ;")

    # add source type
    if not helpers.is_bad(source[4]):
        q += (" INSERT { " + graph + " {" + item + " wdt:P31 \'" +
              helpers.clean(source[4]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P31 ?type}})} ;")

    # add title in native language
    if not helpers.is_bad(source[5]):
        q += (" INSERT { " + graph + " {" + item + " wdt:P1704 \'" +
              helpers.clean(source[5]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P1704 ?title_native}})} ;")

    # add paywall (Yes or No)
    if not helpers.is_bad(source[6]):
        q += (" INSERT { " + graph + " {" + item + " wnp:paywalled \'" +
              helpers.clean(source[6]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:paywalled ?pw}})} ;")

    # add metasource
    if not helpers.is_bad(source[7]):
        q += (" INSERT { " + graph + " {" + item + " wnp:metasource wni:" +
              helpers.strip_spaces(source[7]).lower() +
              """ }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:metasource ?ms}})} ;")

    # add state
    if not helpers.is_bad(source[8]):
        q += (" INSERT { " + graph + " {" + item + " wdt:P131 \'" +
              helpers.clean(source[8]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P131 ?state}})} ;")

    # add wikipedia name
    if not helpers.is_bad(source[10]):
        q += (" INSERT { " + graph + " {" + item + " wnp:wikipedia-name \'" +
              helpers.clean(source[10]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:wikipedia-name ?wp_name}})} ;")

    # add redirect
    if not helpers.is_bad(source[11]):
        q += (" INSERT { " + graph + " {" + item + " wnp:redirect \'" +
              helpers.clean(source[11]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:redirect ?rd}})} ;")

    # add wikipedia link
    if not helpers.is_bad(source[12]):
        q += (" INSERT { " + graph + " {" + item + " wnp:wikipedia-page \'" +
              helpers.clean(source[12]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:wikipedia-page ?wp_page}})} ;")

    # add description
    if not helpers.is_bad(source[14]):
        q += (" INSERT { " + graph + " {" + item + " wnp:description \'" +
              helpers.clean(source[14]) +
              """\' }} WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wnp:description ?desc}})} ;")

    return q
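# How these generated updates might be submitted (a sketch under assumptions,
# not the project's actual loader): the SPARQL 1.1 Update protocol accepts the
# query string as an `update` form field over POST, so requests.post() suffices.
# The endpoint URL below is hypothetical, and the wdt:/wnp:/wni: prefixes must
# either be declared by the endpoint or prepended to q before posting.
def post_update(q, endpoint="http://localhost:9999/sparql"):
    import requests
    resp = requests.post(endpoint, data={"update": q})
    resp.raise_for_status()
    return resp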