Example #1
def consolidate(infile="data/all_raw_cleaned.csv",
                outfile="data/all_metadata.csv"):
    good = read_meta()
    total = 0
    with open(infile, "r", errors="ignore") as inf, open(outfile, "w", newline="") as outf:
        reader = csv.reader(inf, delimiter=",")
        w = csv.writer(outf,
                       delimiter=',',
                       quotechar='"',
                       quoting=csv.QUOTE_MINIMAL)
        for line in reader:
            total += 1
            while len(line) < 16:
                line.append("")

            url = line[1].replace("www.", "")
            if url in good:

                # add title if not there
                if not is_bad(good[url][1]) and is_bad(line[2]):
                    line[2] = good[url][1]

                # add description
                if not is_bad(good[url][2]):
                    line[14] = good[url][2]

                # add locale
                if not is_bad(good[url][3]):
                    line[15] = good[url][3]

            w.writerow(line)
            print(str(total), end="\r")
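
These snippets lean on helpers that are never shown. A minimal sketch of is_bad and read_meta, with bodies guessed purely from the call sites (the filename and the placeholder set are assumptions):

import csv

def is_bad(value):
    # treat empty or placeholder-looking values as missing
    return value is None or str(value).strip() in ("", "None", "N/A")

def read_meta(infile="data/good_meta.csv"):  # hypothetical filename
    # map URL -> full row, so good[url][1] is the title, [2] the
    # description, and [3] the locale, matching the lookups above
    good = {}
    with open(infile, "r", errors="ignore") as inf:
        for row in csv.reader(inf):
            if row:
                good[row[0]] = row
    return good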
Example #2
def locale_to_country(infile="data/all_metadata.csv",
                      outfile="data/all_metadata2.csv"):
    total = 0
    with open(infile, "r", errors="ignore") as inf, open(outfile, "w", newline="") as outf:
        reader = csv.reader(inf, delimiter=",")
        w = csv.writer(outf,
                       delimiter=',',
                       quotechar='"',
                       quoting=csv.QUOTE_MINIMAL)

        for line in reader:
            total += 1

            if not is_bad(line[15]):

                country, language = parse_locale(line[15])

                if language != line[3] and not is_bad(language):
                    line[3] = language

                if country != line[0] and not is_bad(country):
                    line[0] = country

            w.writerow(line)
            print(str(total), end="\r")
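
parse_locale is likewise not shown. A plausible sketch, assuming locale values look like "en_US" or "fr-FR":

def parse_locale(locale):
    # split "en_US" / "fr-FR" style strings into (country, language)
    parts = locale.replace("-", "_").split("_")
    language = parts[0].lower()
    country = parts[1].upper() if len(parts) > 1 else ""
    return country, language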
Example #3
def get_graph_spec(source):
    q = ''
    if helpers.is_bad(source[1]): 
        print(source[1])
        return q
    if source[1].find('.') == -1: return q
    url = '<http://' + urllib.parse.quote(source[1]) + '>'
    url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>' 
    graph = """ GRAPH """ + url 
    # url: drop any existing URL triple, then store the canonical one
    match = "{" + graph + "{ ?item wdt:P1896 ?url}}"
    q += ("DELETE" + match + """
          INSERT { """ + graph + " {" + url_item + " wdt:P1896 " + url + """ }}
          WHERE """ + match + ";")
    #country
    if not helpers.is_bad(source[0]):
        country_code = get_country_code(source[0])
        if not helpers.is_bad(country_code):
            c = country_code
        else:
            c = helpers.clean(source[0])
        match = "{" + graph + "{ ?item wdt:P17 ?country}}"
        q += ("DELETE" + match + """
          INSERT { """ + graph + " {" + url_item + " wdt:P17 " + c + """ }} 
          WHERE """ + match + ";" )
    return q
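
For reference, with source[1] = "lemonde.fr" the URL section above now emits roughly (whitespace trimmed):

DELETE{ GRAPH <http://lemonde.fr>{ ?item wdt:P1896 ?url}}
INSERT { GRAPH <http://lemonde.fr> {<http://lemonde.fr/item> wdt:P1896 <http://lemonde.fr> }}
WHERE { GRAPH <http://lemonde.fr>{ ?item wdt:P1896 ?url}};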
Example #4
def get_sources():
    sources = {}
    pref = '/Users/lavanyasingh/Desktop/GSC2O19internet_archive/data/cleaned'
    paths = os.listdir(pref)
    if '.DS_Store' in paths:
        paths.remove('.DS_Store')
    for path in paths:
        total = 0
        with open('data/cleaned/' + path, 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            next(reader)  # skip the header row
            for item in reader:
                # skip rows whose URL field still carries a 'subject=' query
                if 'subject=' not in item[1]:
                    total += 1
                    url = helpers.truncate(item[1])
                    item[1] = url
                    val = check_sources(list(sources.keys()), url, path)
                    if val != -1:
                        for i in range(len(item)):
                            if not helpers.is_bad(item[i]):
                                sources[val][i] = item[i]
                        if len(url) < len(val): sources[val][1] = url
                    else:
                        item += ['' for i in range(10)]
                        if (item[0] == "United States"
                                or item[0].lower() == "us"
                                or item[0].lower() == "usa"):
                            item[0] = "United States of America"
                        if path != 'sheet_cleaned.csv':
                            item[7] = get_meta(path)
                        sources.update({url: item})
                if total % 2500 == 0: print(path, total)
        print(path, total)
    return sources
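
check_sources is not shown; a simple sketch that treats two URLs as the same source when one is a prefix of the other (the real matching may well be fuzzier, and the path argument is accepted but unused here):

def check_sources(known_urls, url, path):
    # return the existing key that matches this URL, or -1 if none does
    for known in known_urls:
        if known.startswith(url) or url.startswith(known):
            return known
    return -1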
Example #5
def get_coords(sources):
    data = {}
    coord = ""
    for source in sources:
        if not helpers.is_bad(source[9]):
            if not helpers.is_bad(source[0]):
                coord = get_country_city_coord(source[0], source[9])
            else:
                coord = get_city_coord(source[9])
        elif (not helpers.is_bad(source[8]) and source[0]
        in ["united states", "unitedstatesofamerica", "us", "usa"]):
                coord = get_state_coord_us(source[9])
        elif not helpers.is_bad(source[0]):
            coord = get_country_coord(source[0])
        if coord != "": 
            data.update({source[1]: coord})
            
    return data
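
The coordinate helpers are undefined here; one plausible implementation uses geopy's Nominatim geocoder (the geocoder choice and the "lat,lon" string format are assumptions):

from geopy.geocoders import Nominatim

_geocoder = Nominatim(user_agent="gsoc2019-internet-archive")

def get_country_city_coord(country, city):
    # geocode "city, country" and return "lat,lon", or "" when not found
    loc = _geocoder.geocode(f"{city}, {country}")
    return f"{loc.latitude},{loc.longitude}" if loc else ""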
Example #6
def make_cleaned():
    cleaned = []
    for url in sources:
        line = sources[url]
        if url in good:
            line.append(codes[url][1])
            line.append(meta[url][1])
            if helpers.is_bad(line[2]) or line[7].find("original") == -1:
                line[2] = meta[url][0]
            cleaned.append(line)
    return cleaned
Example #7
    def make_all_data(self, infile):
        rows = {}
        with open('data/' + infile, 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            for line in reader:

                # clean URL
                url = helpers.truncate(line[1])
                line[1] = url
                if len(rows) % 5000 == 0: print(url)

                # get path ('/money' or '/index' for example)
                o = urllib.parse.urlparse(self.prep_url(line[1]))
                path = o.path

                # add metasource
                metasource = self.clean_meta(line[7])
                line[7] = [metasource]

                # check for various spellings of USA
                if (line[0] == "United States" or line[0].lower() == "us"
                        or line[0].lower() == "usa"):
                    line[0] = "United States of America"

                # extend row
                line += ['', '']

                # if unique url, add path
                if url not in rows:
                    if self.url_is_good(url):
                        if self.path_is_good(path):
                            line[13] = [path]
                        rows.update({url: line})
                else:
                    # add metasource if necessary
                    if metasource not in rows[url][7]:
                        rows[url][7].append(metasource)

                    # update any broken metadata to new value
                    for i in range(len(rows[url]) - 1):
                        if helpers.is_bad(rows[url][i]):
                            try:
                                rows[url][i] = line[i]
                            except IndexError:
                                pass

                    # add path if good
                    if self.path_is_good(path):
                        rows[url][13].append(path)

        return rows
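
The write-out step for these merged rows is not shown; a minimal sketch of how they might be flattened back to CSV (the ";" separator for the list-valued columns is an assumption):

import csv

def write_rows(rows, outfile):
    with open(outfile, "w", newline="") as outf:
        w = csv.writer(outf)
        for url, line in rows.items():
            line[7] = ";".join(line[7])        # metasource list
            if isinstance(line[13], list):     # path list; may still be ''
                line[13] = ";".join(line[13])
            w.writerow(line)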
Example #8
    def process(self):
        total = 0
        size = 1000000
        random.seed()
        countries = []
        with open(self.infile, 'r') as inf, open(self.outfile, 'w', newline='') as outf:
            reader = csv.reader(inf, delimiter=',')
            w = csv.writer(outf,
                           delimiter=',',
                           quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
            w.writerow(["Country", "URL", "Title", "Size"])

            for line in reader:
                total += 1
                print(str(total), end="\r")

                # skip entries with broken titles
                if helpers.is_bad(line[2]): continue

                # skip countries we've already seen and bad countries
                if line[0] in countries or helpers.is_bad(line[0]): continue

                size = size / 1.05

                if total < 5:
                    size = size * 5

                try:
                    # a title that parses as a number once "Q" is stripped
                    # (e.g. a Wikidata "Q123" id) is not a real title; skip it
                    int(line[2].replace("Q", "")[0:3])
                    continue
                except ValueError:
                    countries.append(line[0])

                    row = [line[0], line[1], line[2], size]
                    w.writerow(row)
Example #9
def get_graph_spec(info):
    url_raw, metasource = info[0], info[1]
    q = ''
    if helpers.is_bad(url_raw):
        print(url_raw)
        return q
    if url_raw.find('.') == -1: return q
    url = '<http://' + urllib.parse.quote(url_raw) + '>'
    url_item = '<http://' + urllib.parse.quote(url_raw) + '/item>'
    graph = """ GRAPH """ + url
    ms = helpers.strip_spaces(metasource)
    q = "INSERT {" + graph + "{" + url_item + "wnp:metasource wni:" + ms + """}}
    WHERE {FILTER (EXISTS {""" + graph + """{?s ?p ?o} } && 
    NOT EXISTS {""" + graph + "{ ?item wnp:metasource wni:" + ms + "}})};"
    return q
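
For url_raw = "example.com" and metasource = "media cloud" this builds, roughly:

INSERT { GRAPH <http://example.com>{<http://example.com/item> wnp:metasource wni:mediacloud}}
WHERE {FILTER (EXISTS { GRAPH <http://example.com>{?s ?p ?o} } &&
NOT EXISTS { GRAPH <http://example.com>{ ?item wnp:metasource wni:mediacloud}})};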
Example #10
def make_all_data():
    total, uq = 0, 0
    rows = {}
    for path in paths:
        print(path)
        with open('data/raw/' + path, 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            for line in reader:
                total += 1
                url = clean_url(line[1])
                line[1] = url
                metasource = clean_meta(line[7])
                row = line
                if (row[0] == "United States" or row[0].lower() == "us"
                        or row[0].lower() == "usa"):
                    row[0] = "United States of America"
                row[7] = [metasource]
                row = [row[i] for i in range(12)] + ['', '']
                if url not in rows:
                    if url_is_good(url):
                        uq += 1
                        if len(sources[url]) < 10:
                            row[13] = sources[url]
                        else:
                            row[13] = []
                        rows.update({url: row})
                else:
                    if metasource not in rows[url][7]:
                        rows[url][7].append(metasource)
                    for i in range(len(rows[url]) - 1):
                        if helpers.is_bad(rows[url][i]):
                            try:
                                rows[url][i] = row[i]
                            except IndexError:
                                print(row, i)
                                return "OOPS"
                if total % 10000 == 0 and url_is_good(url):
                    print(url, rows[url])
        print("DONE", path, total, uq)
    return rows
Example #11
    def process_geo(self):
        countries = self.read_countries()
        total = 0
        with open(self.infile, 'r') as inf, open(self.outfile, 'w', newline='') as outf:
            reader = csv.reader(inf, delimiter=',')
            w = csv.writer(outf,
                           delimiter=',',
                           quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
            w.writerow([
                "ISO Code", "Latitude", "Longitude", "Country", "URL", "Title"
            ])

            for line in reader:
                total += 1
                # skip entries with broken country values
                if helpers.is_bad(line[0]): continue

                country = process.extract(line[0],
                                          list(countries.keys()),
                                          limit=1)[0][0]
                print(str(total), end="\r")
                row = countries[country] + [line[1], line[2]]
                w.writerow(row)
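
Here read_countries (not shown) presumably maps a canonical country name to [iso_code, latitude, longitude, name], e.g. {"France": ["FR", "46.2", "2.2", "France"]}, so that countries[country] + [line[1], line[2]] lines up with the six-column header written above.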
Example #12
def get_graph_spec(source):
    q = ''
    if helpers.is_bad(source[1]): 
        print(source[1])
        return q
    if source[1].find('.') == -1: return q
    url = '<http://' + urllib.parse.quote(source[1]) + '>'
    url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>' 
    graph = """ GRAPH """ + url 
    #url
    q += ("INSERT { " + graph + " {" + url_item + " wdt:P1896 \'" + 
            urllib.parse.quote(source[1]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wdt:P1896 ?url}})} ;" )
    #country
    if not helpers.is_bad(source[0]):
        country_code = get_country_code(source[0])
        if not helpers.is_bad(country_code):
            c = country_code
        else:
            c = helpers.clean(source[0])
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P17 \'" + c + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wdt:P17 ?country}})} ;" )
    #title
    if not helpers.is_bad(source[2]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P1448 \'" + helpers.clean(source[2]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wdt:P1448 ?title}})} ;" )
    #language
    if not helpers.is_bad(source[3]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P37 \'" + helpers.clean(source[3]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wdt:P37 ?lang}})} ;" )
    #type
    if not helpers.is_bad(source[4]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P31 \'" + helpers.clean(source[4]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wdt:P31 ?type}})} ;" )
    #title (native language)
    if not helpers.is_bad(source[5]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P1704 \'" + helpers.clean(source[5]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wdt:P1704 ?title_native}})} ;" )
    #paywall
    if not helpers.is_bad(source[6]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:paywalled \'" + helpers.clean(source[6]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wnp:paywalled ?pw}})} ;" )
    #metasource
    if not helpers.is_bad(source[7]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:metasource wni:" + 
        helpers.strip_spaces(source[7]).lower()   + """ }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wnp:metasource ?ms}})} ;" )
    #state
    if not helpers.is_bad(source[8]):
        q += (" INSERT { " + graph + " {" + url_item + " wdt:P131 \'" + helpers.clean(source[8]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wdt:P131 ?state}})} ;" )
    #wikipedia name
    if not helpers.is_bad(source[10]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:wikipedia-name \'" + helpers.clean(source[10]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wnp:wikipedia-name ?wp_name}})} ;" )
    #redirects?
    if not helpers.is_bad(source[11]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:redirect \'" + helpers.clean(source[11]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wnp:redirect ?rd}})} ;" )
    #wikipedia link
    if not helpers.is_bad(source[12]):
        q += (" INSERT { " + graph + " {" + url_item + " wnp:wikipedia-page \'" + helpers.clean(source[12]) + """\' }}
        WHERE {FILTER (NOT EXISTS {""" + graph + "{ ?item wnp:wikipedia-page ?wp_page}})} ;" )
    return q
Example #13
    def overwrite(self, source):
        q = ''

        # check for a valid url
        if helpers.is_bad(source[1]) or source[1].find('.') == -1:
            return q

        # add url to graph
        url = '<http://' + urllib.parse.quote(source[1].replace(
            "http://", "").replace("https://", "")) + '>'
        url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
        graph = """ GRAPH """ + url

        # add url
        match = "{" + graph + "{ ?item wdt:P1896 ?url}}"
        q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P1896 "
              "" + url + """ }} 
              WHERE {} ;""")

        # add country
        if not helpers.is_bad(source[0]):
            country_code = self.get_country_code(source[0])
            if not helpers.is_bad(country_code):
                c = country_code
            else:
                c = helpers.clean(source[0])
            match = "{" + graph + "{ ?item wdt:P17 ?country}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P17 " + c +
                  """ }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add title
        if not helpers.is_bad(source[2]):
            match = "{" + graph + "{ ?item wdt:P1448 ?title}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P1448 \'" +
                  helpers.clean_string(source[2]) + """\' }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add language
        if not helpers.is_bad(source[3]):
            match = "{" + graph + "{ ?item wdt:P37 ?lang}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P37 \'" +
                  helpers.clean_string(source[3]) + """\' }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add type
        if not helpers.is_bad(source[4]):
            match = "{" + graph + "{ ?item wdt:P31 ?type}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P31 \'" +
                  helpers.clean_string(source[4]) + """\' }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add title (native language)
        if not helpers.is_bad(source[5]):
            match = "{" + graph + "{ ?item wdt:P1704 ?title}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P1704 \'" +
                  helpers.clean_string(source[5]) + """\'}} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add paywall
        if not helpers.is_bad(source[6]):
            match = "{" + graph + "{ ?item wnp:paywalled ?pw}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wnp:paywalled \'" +
                  helpers.clean_string(source[6]) + """\' }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add metasources
        for ms in source[7]:
            if not helpers.is_bad(ms):
                match = "{" + graph + "{ ?item wnp:metasource ?ms}}"
                q += ("DELETE" + match + """
                  INSERT { """ + graph + " {" + url_item +
                      " wnp:metasource wni:" +
                      helpers.strip_spaces(ms).lower() + """ }} 
                  WHERE { OPTIONAL """ + match + """ } ;""")

        # add state
        if not helpers.is_bad(source[8]):
            match = "{" + graph + "{ ?item wdt:P131 ?state}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P131 \'" +
                  helpers.clean_string(source[8]) + """\' }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add wikipedia name
        if not helpers.is_bad(source[10]):
            match = "{" + graph + "{ ?item wnp:wikipedia-name ?wp_name}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item +
                  " wnp:wikipedia-name \'" + helpers.clean_string(source[10]) +
                  """\' }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add redirects?
        if not helpers.is_bad(source[11]):
            match = "{" + graph + "{ ?item wnp:redirect ?rd}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wnp:redirect \'" +
                  helpers.clean_string(source[11]) + """\' }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add wikipedia link
        if not helpers.is_bad(source[12]):
            match = "{" + graph + "{ ?item wnp:wikipedia-page ?wp_page}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item +
                  " wnp:wikipedia-page \'" + helpers.clean_string(source[12]) +
                  """\' }} 
              WHERE { OPTIONAL """ + match + """ } ;""")

        # add description
        try:
            if not helpers.is_bad(source[14]):
                match = "{" + graph + "{ ?item wnp:description ?desc}}"
                q += ("DELETE" + match + """
                  INSERT { """ + graph + " {" + url_item +
                      " wnp:description \'" +
                      helpers.clean_string(source[14]) + """\' }} 
                  WHERE { OPTIONAL """ + match + """ } ;""")
        except IndexError:
            pass

        return q
Example #14
    def first_load(self, source):

        # checks for bad URLs
        if helpers.is_bad(source[1]) or source[1].find('.') == -1:
            return ''

        # insert URL
        url = '<http://' + urllib.parse.quote(source[1]) + '>'
        url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
        q = """GRAPH """ + url + """ { 
        """ + url_item + """ wdt:P1896 """ + url

        # add country
        if not helpers.is_bad(source[0]):
            country_code = self.get_country_code(source[0])
            if not helpers.is_bad(country_code):
                q += """;
                wdt:P17 """ + country_code
            else:
                q += """;
                wdt:P17 \'""" + helpers.clean_string(source[0]) + """\' """

        # add title
        if not helpers.is_bad(source[2]):
            q += """;
                wdt:P1448 \'""" + helpers.clean_string(source[2]) + """\' """

        # add language
        if not helpers.is_bad(source[3]):
            q += """;
                wdt:P37 \'""" + helpers.clean_string(source[3]) + """\' """

        #add type
        if not helpers.is_bad(source[4]):
            q += """;
                wdt:P31 \'""" + helpers.clean_string(source[4]) + """\' """

        #add title (native language)
        if not helpers.is_bad(source[5]):
            q += """;
                wdt:P1704 \'""" + helpers.clean_string(source[5]) + """\' """

        # add paywall
        if not helpers.is_bad(source[6]):
            q += """;
                wnp:paywalled \'""" + helpers.clean_string(
                source[6]) + """\' """

        # add metasources
        if not helpers.is_bad(source[7]):
            q += self.get_ms(source[7])

        # add state
        if not helpers.is_bad(source[8]):
            q += """;
                wdt:P131 \'""" + helpers.clean_string(source[8]) + """\' """

        # add town
        if not helpers.is_bad(source[9]):
            q += """;
                wdt:P131 \'""" + helpers.clean_string(source[9]) + """\' """

        # add wikipedia name
        if not helpers.is_bad(source[10]):
            q += """;
                wnp:wikipedia-name \'""" + helpers.clean_string(
                source[10]) + "\' "

        # add redirects?
        if not helpers.is_bad(source[11]):
            q += """;
                wnp:redirect \'""" + helpers.clean_string(
                source[11]) + """\' """

        # add wikipedia link
        if not helpers.is_bad(source[12]):
            q += """;
                wnp:wikipedia-page \'""" + urllib.parse.quote(
                source[12]) + """\'"""

        # add paths
        if not helpers.is_bad(source[13]):
            q += self.get_path_spec(source[13])

        # add description
        if not helpers.is_bad(source[14]):
            q += """;
                wnp:description \'""" + helpers.clean_string(source[14]) + "\'"

        q += """.}"""

        return q
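
first_load returns a bare GRAPH block, so the caller presumably wraps it in an update before sending it to the store, along the lines of ("loader" is a hypothetical instance, not from the original code):

query = "INSERT DATA { " + loader.first_load(source) + " }"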
Example #15
    def no_overwrite(self, source):
        q = ''
        if helpers.is_bad(source[1]):
            print(source[1])
            return q
        # this means our url is not valid
        if source[1].find('.') == -1: return q

        # begin constructing graph spec
        # construct item
        item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
        # construct item URL
        url = '<http://' + urllib.parse.quote(source[1]) + '>'
        # construct graph value
        graph = """ GRAPH """ + url

        # add URL
        q += ("INSERT { " + graph + " {" + item + " wdt:P1896 " + url + """ }} 
                WHERE {FILTER (NOT EXISTS {""" + graph +
              "{ ?item wdt:P1896 ?url}})} ;")

        # add country
        if not helpers.is_bad(source[0]):
            country_code = self.get_country_code(source[0])
            if not helpers.is_bad(country_code):
                c = country_code
            else:
                c = helpers.clean(source[0])
            q += (" INSERT { " + graph + " {" + item + " wdt:P17 \'" + c +
                  """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wdt:P17 ?country}})} ;")

        # add title
        if not helpers.is_bad(source[2]):
            q += (" INSERT { " + graph + " {" + item + " wdt:P1448 \'" +
                  helpers.clean(source[2]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wdt:P1448 ?title}})} ;")

        # add language
        if not helpers.is_bad(source[3]):
            q += (" INSERT { " + graph + " {" + item + " wdt:P37 \'" +
                  helpers.clean(source[3]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wdt:P37 ?lang}})} ;")

        # add source type
        if not helpers.is_bad(source[4]):
            q += (" INSERT { " + graph + " {" + item + " wdt:P31 \'" +
                  helpers.clean(source[4]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wdt:P31 ?type}})} ;")

        # add title in native language
        if not helpers.is_bad(source[5]):
            q += (" INSERT { " + graph + " {" + item + " wdt:P1704 \'" +
                  helpers.clean(source[5]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wdt:P1448 ?title_native}})} ;")

        # add paywall (Yes or No)
        if not helpers.is_bad(source[6]):
            q += (" INSERT { " + graph + " {" + item + " wnp:paywalled \'" +
                  helpers.clean(source[6]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wnp:paywalled ?pw}})} ;")

        # add metasource
        if not helpers.is_bad(source[7]):
            q += (" INSERT { " + graph + " {" + item + " wnp:metasource wni:" +
                  helpers.strip_spaces(source[7]).lower() + """ }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wnp:metasource ?ms}})} ;")

        # add state
        if not helpers.is_bad(source[8]):
            q += (" INSERT { " + graph + " {" + item + " wdt:P131 \'" +
                  helpers.clean(source[8]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wdt:P131 ?state}})} ;")

        # add wikipedia name
        if not helpers.is_bad(source[10]):
            q += (" INSERT { " + graph + " {" + item +
                  " wnp:wikipedia-name \'" + helpers.clean(source[10]) +
                  """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wnp:wikipedia-name ?wp_name}})} ;")

        # add redirect
        if not helpers.is_bad(source[11]):
            q += (" INSERT { " + graph + " {" + item + " wnp:redirect \'" +
                  helpers.clean(source[11]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wnp:redirect ?rd}})} ;")

        # add wikipedia link
        if not helpers.is_bad(source[12]):
            q += (" INSERT { " + graph + " {" + item +
                  " wnp:wikipedia-page \'" + helpers.clean(source[12]) +
                  """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wnp:wikipedia-page ?wp_page}})} ;")

        # add description
        if not helpers.is_bad(source[14]):
            q += (" INSERT { " + graph + " {" + item + " wnp:description \'" +
                  helpers.clean(source[14]) + """\' }}
            WHERE {FILTER (NOT EXISTS {""" + graph +
                  "{ ?item wnp:description ?desc}})} ;")

        return q
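
A hypothetical driver tying the three builders together (the method name, the mode flag, and the INSERT DATA wrapper are assumptions, not part of the original class):

    def load(self, source, mode="no_overwrite"):
        # route a source row to the appropriate query builder
        if mode == "first_load":
            return "INSERT DATA { " + self.first_load(source) + " }"
        if mode == "overwrite":
            return self.overwrite(source)
        return self.no_overwrite(source)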