def get_continent(name):
    """Return the continent name for the country identified by ``name``.

    ``name`` is resolved to a country code with ``get_cc``. When no code is
    found, the string ``'Unknown'`` is returned. Codes present in
    ``continent_map`` take precedence over the lookup done by
    ``transformations.cca_to_ctn``.
    """
    cc = get_cc(name)
    # Identity test: None is a singleton and ``==`` can be overridden.
    if cc is None:
        return 'Unknown'
    if cc in continent_map:
        return continent_map[cc]
    return transformations.cca_to_ctn(cc)
def normalizeTagAn():
    """Normalise per-continent tag counts and write them to ``continentTags.txt``.

    Reads ``tag_ana_country.txt``. Usable lines hold three tab-separated
    fields: a country code, a tag and an integer count. Counts are summed per
    continent (resolved with ``transformations.cca_to_ctn``) and every tag
    count is divided by the total count of its continent. One
    ``continent<TAB>tag<TAB>share`` line is written per pair.

    Lines that do not have exactly three fields, that have an empty tag, or
    whose country code or count cannot be converted are skipped.
    """
    continentTags = defaultdict(lambda: defaultdict(int))
    with open("tag_ana_country.txt", "r") as f:
        for line in f.read().split("\n"):
            parts = line.split("\t")
            if len(parts) != 3 or parts[1] == "":
                continue
            try:
                # Convert both values before touching the dict, so that a bad
                # line cannot leave a phantom zero-count entry behind.
                continent = transformations.cca_to_ctn(parts[0])
                count = int(parts[2])
            except Exception:
                # Best effort: ignore unusable lines, but no longer swallow
                # KeyboardInterrupt / SystemExit like a bare ``except`` does.
                continue
            continentTags[continent][parts[1]] += count

    # count total
    totalTagOccurences = {}
    for continent, tags in continentTags.items():
        totalTagOccurences[continent] = sum(tags.values())

    # normalize (done before the output file is opened, so a failure here
    # does not truncate an existing result file)
    normalizedContinentTags = defaultdict(lambda: defaultdict(int))
    for continent, tags in continentTags.items():
        total = float(totalTagOccurences[continent])
        for tag, count in tags.items():
            normalizedContinentTags[continent][tag] = float(count) / total

    # write it to file
    with open("continentTags.txt", 'w') as g:
        for continent in continentTags:
            for tag, share in normalizedContinentTags[continent].items():
                g.write(
                    str(continent) + "\t" + str(tag) + "\t" + str(share) +
                    "\n")
def find_continent(location):
    """Resolve a location string to a continent name, caching successful lookups.

    Returns the continent from ``location_continent`` when the location is
    already cached, otherwise the continent derived from ``geolocalize`` and
    ``transformations.cca_to_ctn``. Returns the string 'NaN' when
    ``pd.notnull(location)`` is false, and the current ``invalid`` message
    (possibly the empty string) once the 30 second deadline has passed.

    NOTE(review): the nesting below follows the most plausible reading of the
    source; confirm that the final ``else`` pairs with the null check.
    """
    if pd.notnull(location):
        if location in location_continent.keys():  # Check if the info is already in the location_continent dict
            continent = location_continent[location]
            print "%s -> %s (from dict)" %(location, continent)
            return continent
        else:
            if len(location) > 2:  # Avoid splitting country codes
                # Insert a space before a capital letter that directly follows
                # a word character (e.g. BrevigMission => Brevig Mission)
                location = re.sub(r"(\w)([A-Z])", r"\1 \2", location)
            request = ''
            invalid = ''
            timeout = time.time() + 30  # Setting timeout for API request (in seconds)
            # Retry once per second until a non-empty result or the deadline.
            while not request:
                if time.time() > timeout:
                    return invalid
                time.sleep(1)
                try:
                    # Uses the short_name of the last address component of the
                    # first result; presumably a country code - TODO confirm.
                    request = geolocalize(location.strip())[0]['address_components'][-1]['short_name']
                    # NOTE(review): the next line calls geolocalize a second
                    # time only to print the response.
                    print geolocalize(location.strip())
                    print dir(request)
                    continent = transformations.cca_to_ctn(request)
                    print "%s -> %s (from API)" %(location, continent)
                    # NOTE(review): the cache key is the possibly re-spaced
                    # location, not the string the caller passed in.
                    location_continent[location] = continent
                    return continent
                except:
                    # NOTE(review): a bare except; if ``request`` was already
                    # set when the failure happened, the loop ends and the
                    # function falls through, returning None.
                    invalid = "invalid continent for %s" %location
    else:
        return 'NaN'
def __init__(self, *args, **kwargs):
    """Initialise the mapping and rewrite its fields in telize.com format.

    After the parent initialiser has run, the original ``country`` value is
    stored as ``country_code``, ``org`` is copied to ``isp``, and
    ``continent`` and ``country`` are derived from the country code through
    ``transformations``.
    """
    super(IpLocation, self).__init__(*args, **kwargs)

    # preserve telize.com format
    country_code = self['country']
    self['country_code'] = country_code
    self['isp'] = self['org']
    self['continent'] = transformations.cca_to_ctn(country_code)
    self['country'] = transformations.cca_to_cn(country_code)
def get_continent(host_ip):
    """Return the continent name for ``host_ip`` using the GeoIP database.

    The database path is read from the ``GeoIPData`` option of the ``Config``
    section. An empty country code yields 'North America', the code 'EU'
    yields 'Europe', and every other code is passed to
    ``transformations.cca_to_ctn``.
    """
    database = pygeoip.GeoIP(config.get('Config', 'GeoIPData'))
    code = database.country_code_by_addr(host_ip)

    # Codes answered directly rather than through the transformations lookup.
    if code == '':
        return 'North America'
    if code == 'EU':
        return 'Europe'
    return transformations.cca_to_ctn(code)
def get_continent(country_short):
    '''
    Input: string
    Output: string

    Using the transformations package this function uses a country
    abbreviation to find the name of the continent the country belongs to.

    When the lookup fails, "EU" maps to "Europe", "AP" maps to "Asia" and
    any other value yields the string "none".
    '''
    try:
        continent = transformations.cca_to_ctn(country_short)
    except Exception:
        # Deliberate best-effort fallback; ``except Exception`` keeps it but
        # lets KeyboardInterrupt / SystemExit propagate.
        if country_short == "EU":
            continent = "Europe"
        elif country_short == "AP":
            continent = "Asia"
        else:
            continent = "none"
    return continent
def create_continents(app, schema_editor):
    """Group every ``Country`` under a continent region and save the regions.

    The continent name comes from ``transformations.cca_to_ctn``; for the
    codes listed in ``fallback_continents`` a fixed name is used when that
    lookup raises ``KeyError``. Each country is handed to
    ``add_country_to_continent`` and every collected region is saved.

    Args:
        app: object providing ``get_model(app_label, model_name)``.
        schema_editor: not used by this function.

    Raises:
        KeyError: the country code is known neither to ``transformations``
            nor to the fallback table. Previously such a country was silently
            filed under the continent of the preceding country (or failed
            with an unbound local on the first iteration).
    """
    # Codes handled when the transformations lookup raises KeyError.
    fallback_continents = {
        'SX': 'North America',
        'BQ': 'South America',
        'CW': 'South America',
        'SS': 'Africa',
    }

    Country = app.get_model('gmm_region', 'Country')
    Region = app.get_model('gmm_region', 'Region')
    continents = {}
    for country in Country.objects.all():
        code = country.country_code
        try:
            continent_name = transformations.cca_to_ctn(code)
        except KeyError:
            if code not in fallback_continents:
                raise
            continent_name = fallback_continents[code]
        # Runs only once a continent was resolved for this country (the
        # former ``finally`` also ran while an error was propagating).
        add_country_to_continent(country, continent_name, continents, Region)

    for continent_region in continents.values():
        continent_region.save()
def get_pingdom_probes(url):
    """Return the active Pingdom probes listed in the feed at ``url``.

    The feed is parsed with ``feedparser``. Each item whose ``pingdom_state``
    is "Active" becomes a dict with the keys ``ip`` and ``region``, where the
    region is the continent name of the probe's country code.
    """
    feed = feedparser.parse(url)
    probes = []
    for entry in feed['items']:
        if entry['pingdom_state'] != "Active":
            continue
        country_code = entry['pingdom_country']['code']
        address = entry['pingdom_ip']
        if country_code == "UK":
            # Hack because Pingdom uses UK not GB for the country code alpha
            region = "Europe"
        else:
            region = transformations.cca_to_ctn(country_code)
        probes.append({"ip": address, "region": region})
    return probes
def _continent(cls, country_code):
    """Return ``(continent_code, continent_name)`` for ``country_code``.

    Both values are looked up through ``transformations``: the code with
    ``cca_to_ctca2`` and the name with ``cca_to_ctn``.
    """
    return (
        transformations.cca_to_ctca2(country_code),
        transformations.cca_to_ctn(country_code),
    )
def predictLand(userList, cursor, X=None, y=None, mode="grid", continentLimit=1000):
    """Train and evaluate classifiers predicting a user's continent.

    When ``X`` is omitted or an empty list, features are built from the game
    information returned by ``readInGameInformation``: per user a text made of
    the joined tags and games, plus a sparse matrix of per-game amounts. Users
    from Antarctica, Africa and Oceania are skipped and at most
    ``continentLimit`` users are kept per continent. Intermediate objects are
    stored with ``saveObject``. The data is then reduced to three components
    with ``TruncatedSVD`` and classified according to ``mode``.

    Args:
        userList: sequence of mappings with the keys ``"steamId"`` and
            ``"loccountrycode"``.
        cursor: passed through to ``readInGameInformation``.
        X: feature data. NOTE(review): when a non-empty ``X`` is supplied it
            appears to go straight to ``TruncatedSVD`` - confirm against the
            callers.
        y: labels belonging to ``X``.
        mode: "grid" (``GridSearchCV``) or "rand" (``RandomizedSearchCV``)
            over an AdaBoost pipeline, "tpot" for ``clfWithTpot``; any other
            value evaluates SVC, RF, AB and KNN in turn.
        continentLimit: maximum number of users kept per continent.

    Returns:
        None. Results are reported by the helpers that are called.
    """
    # Create fresh lists per call: the former ``X=[]`` / ``y=[]`` defaults
    # were appended to below and so kept their content between calls.
    if X is None:
        X = []
    if y is None:
        y = []

    if (X == []):
        #somehow it seems that some user do not have steamid
        userIdList = [user["steamId"] for user in userList]
        userTagDict, userGameDict, userGameTimeDict, gameNameDict = readInGameInformation(
            userIdList, cursor)
        print("All Game Information from DB collected")

        # try to not have too much of the same continents
        continentCounter = defaultdict(lambda: 0)
        chosenContinents = defaultdict(lambda: 0)
        continentTagDict = defaultdict(lambda: defaultdict(lambda: 0))
        gameCount = len(gameNameDict)
        X_game_times = lil_matrix((len(userList), gameCount))
        # Row index of the next accepted user in X_game_times.
        currUser = 0
        for user in userList:
            if currUser % 300 == 0:
                print(currUser, chosenContinents)
            steamId = str(user["steamId"])
            if str(steamId) in userTagDict:
                userTagList = ' '.join(userTagDict[steamId])
                userGameList = ' '.join(userGameDict[steamId])
                continent = ""
                try:
                    # Maybe invalid countrycode given
                    continent = transformations.cca_to_ctn(
                        user["loccountrycode"])
                except Exception:
                    # Fallback for codes the lookup rejects; a code missing
                    # here as well raises KeyError.
                    # NOTE(review): 'BQ' is mapped to 'Africa' - confirm
                    # that this is intended.
                    options = {
                        'FX': 'Europe',
                        'YU': 'Europe',
                        'BQ': 'Africa',
                        'SS': 'Africa',
                        'ZR': 'Africa',
                        'CW': 'South America',
                        'SX': 'North America'
                    }
                    continent = options[user["loccountrycode"]]
                if continent == 'Antarctica' or continent == 'Africa' or continent == 'Oceania':
                    continue
                continentCounter[continent] += 1
                # for graphs
                for tag in userTagDict[steamId]:
                    continentTagDict[continent][tag] += 1
                if chosenContinents[continent] < continentLimit:
                    chosenContinents[continent] += 1
                    # Column index: position of the game in gameNameDict.
                    counter = 0
                    for game in gameNameDict:
                        if game in userGameTimeDict[steamId]:
                            amount = userGameTimeDict[steamId][game]
                            if (amount != 0):
                                X_game_times[currUser, counter] = amount
                        counter += 1
                    currUser += 1
                    X.append(userTagList + userGameList)
                    y.append(continent)

        # Keep only the rows that were filled for accepted users.
        X_reshaped = lil_matrix(X_game_times[:currUser, :])
        saveObject(X, "x_file")
        saveObject(y, "y_file")
        saveObject(X_reshaped, "x_game_times_file")
        print(chosenContinents)
        print("cached x, y and x_game_times")
        count_vect = CountVectorizer()
        X = count_vect.fit_transform(X)
        X_combined = combineLilMats(X, X_reshaped)
        saveObject(X_combined, "X_comb")
        #X_my = count_vect.fit_transform(X_my)
        #X_my2 = combineLilMats(X_my, X_game_times2)
        X = X_combined

    print("Chosen mode: " + mode)
    print("X Data in Shape: " + str(X.shape))
    svd = TruncatedSVD(n_components=3)
    X = svd.fit_transform(X)
    #from mpl_toolkits.mplot3d import Axes3D
    #import matplotlib.pyplot as plt
    #fig = plt.figure()
    #ax = plt.axes(projection='3d')
    #contToNb = {'Europe': 1, 'North America': 2, 'Asia': 3, 'South America': 4}
    #c = [contToNb[x] for x in y ]
    #x_ = [i[0] for i in X]
    #y_ = [i[1] for i in X]
    #z_ = [i[2] for i in X]
    #ax.scatter(x_, y_, z_, c=c)
    #plt.show()
    print("X Data in Shape after TruncatedSVD: " + str(X.shape))

    if mode == "grid" or mode == "rand":
        clfName = "AB"
        pipe = Pipeline([('scaler', MaxAbsScaler()),
                         ('clf', AdaBoostClassifier())])
        if mode == "grid":
            clf = GridSearchCV(pipe,
                               param_grid=getParams(clfName),
                               verbose=10,
                               n_jobs=2)
        else:
            clf = RandomizedSearchCV(pipe,
                                     param_distributions=getParams(clfName),
                                     n_iter=20,
                                     verbose=10,
                                     n_jobs=2)
        classifyAndPrintResults(clf, clfName, X, y, mode=mode)
    elif mode == "tpot":
        clfWithTpot(X, y)
    else:
        classifiers = [("SVC", SVC()), ("RF", RandomForestClassifier()),
                       ("AB", AdaBoostClassifier()),
                       ("KNN", KNeighborsClassifier())]
        for clf_pair in classifiers:
            clfName = clf_pair[0]
            clf = clf_pair[1]
            classifyAndPrintResults(clf, clfName, X, y, mode=mode)
def migrate_repo(repo):
    """Build a register record and a statistics entry from one repository.

    ``repo`` is read through ``get``, ``find``, iteration and ``.text``, so
    it is expected to behave like an ElementTree element. Most values are
    copied by the ``_extract`` helper. Policy terms are translated through
    ``policy_map``, and terms that have an entry in ``instruction_map``
    contribute patches that are applied to the finished record with
    ``_apply``.

    The ``country``, ``policies`` and ``contacts`` children are optional: a
    missing one is skipped, in the same way as ``classes``, ``languages`` and
    ``contentTypes``.

    Returns:
        A tuple ``(record, [statistics])``. ``record`` has the keys
        ``"register"`` and ``"admin"``; ``statistics`` is a dict describing
        the item count.
    """
    # the various components we need to assemble
    opendoar = {}
    metadata = {}
    organisation = {}
    contacts = []
    apis = []
    statistics = {}
    register = {}
    software = {}
    policies = []

    # a record of the patches to be applied to the data (mostly come from the policy data)
    patches = []

    # original opendoar id
    odid = repo.get("rID")
    if odid is not None:
        opendoar["rid"] = odid

    # repository name
    _extract(repo, "rName", metadata, "name", unescape=True)

    # repository acronym
    _extract(repo, "rAcronym", metadata, "acronym", unescape=True)

    # repository url
    _extract(repo, "rUrl", metadata, "url")

    # oai base url
    oai = {"api_type" : "oai-pmh"}
    _extract(repo, "rOaiBaseUrl", oai, "base_url")
    if "base_url" in oai:
        apis.append(oai)

    # organisational details
    _extract(repo, "uName", organisation, "unit", unescape=True)
    _extract(repo, "uAcronym", organisation, "unit_acronym", unescape=True)
    _extract(repo, "uUrl", organisation, "unit_url")
    _extract(repo, "oName", organisation, "name", unescape=True)
    _extract(repo, "oAcronym", organisation, "acronym", unescape=True)
    _extract(repo, "oUrl", organisation, "url")
    _extract(repo, "paLatitude", organisation, "lat", cast=float)
    _extract(repo, "paLongitude", organisation, "lon", cast=float)

    # country details (a repository without a country element is tolerated)
    cel = repo.find("country")
    isocode = None
    if cel is not None:
        _extract(cel, "cIsoCode", metadata, "country_code", lower=True)
        _extract(cel, "cIsoCode", organisation, "country_code", lower=True)
        isocode = cel.find("cIsoCode")
    if isocode is not None:
        code = isocode.text
        if code is not None and code != "":
            try:
                # specify the continent in the metadata
                continent_code = transformations.cca_to_ctca2(code)
                metadata["continent_code"] = continent_code.lower()
                continent = transformations.cca_to_ctn(code)
                metadata["continent"] = continent

                # normalised country name
                country = pycountry.countries.get(alpha2=code.upper()).name
                metadata["country"] = country
                organisation["country"] = country
            except KeyError:
                # unknown code: leave whatever was set before the failure
                pass

    # repository description
    _extract(repo, "rDescription", metadata, "description", unescape=True)

    # remarks
    _extract(repo, "rRemarks", metadata, "description", unescape=True, append=True, prepend=" ")

    # statistics
    _extract(repo, "rNumOfItems", statistics, "value", cast=int)
    _extract(repo, "rDateHarvested", statistics, "date")

    # established date
    _extract(repo, "rYearEstablished", metadata, "established_date")

    # repository type
    _extract(repo, "repositoryType", metadata, "repository_type", aslist=True)

    # operational status
    _extract(repo, "operationalStatus", register, "operational_status")

    # software
    _extract(repo, "rSoftWareName", software, "name", unescape=True)
    _extract(repo, "rSoftWareVersion", software, "version")

    # subject classifications
    classes = repo.find("classes")
    if classes is not None:
        metadata["subject"] = []
        for c in classes:
            subject = {}
            _extract(c, "clCode", subject, "code")
            _extract(c, "clTitle", subject, "term", unescape=True)
            metadata["subject"].append(subject.get("term")) # FIXME: a bit of a round trip here, but will suffice

    # languages
    langs = repo.find("languages")
    if langs is not None:
        metadata["language_code"] = []
        metadata["language"] = []
        for l in langs:
            code = l.find("lIsoCode")
            # skip missing elements as well as empty or absent text
            if code is not None and code.text:
                lc = code.text.lower()
                lang = pycountry.languages.get(alpha2=lc).name
                metadata["language_code"].append(lc)
                metadata["language"].append(lang)

    # content types
    ctel = repo.find("contentTypes")
    if ctel is not None:
        metadata["content_type"] = []
        for ct in ctel:
            metadata["content_type"].append(ct.text)

    # policies
    polel = repo.find("policies")
    if polel is not None:
        for p in polel:
            policy = {}
            _extract(p, "policyType", policy, "policy_type")
            posel = p.find("poStandard")
            if posel is not None:
                policy["terms"] = []
                for item in posel:
                    if item.text is None:
                        # an empty term element has nothing to map
                        continue
                    t = item.text.strip()

                    # only keep terms which have mappings in the policy map
                    mapped = policy_map.get(t)
                    if mapped is not None:
                        policy["terms"].append(mapped)

                    # look for any special instructions on the term
                    patch = instruction_map.get(t)
                    if patch is not None:
                        patches.append(patch)
            # policies without any mapped term are dropped
            if policy.get("terms"):
                policies.append(policy)

    # contacts
    conel = repo.find("contacts")
    if conel is not None:
        for contact in conel:
            cont_details = {}
            _extract(contact, "pName", cont_details, "name", unescape=True)
            _extract(contact, "pJobTitle", cont_details, "job_title", unescape=True)
            _extract(contact, "pEmail", cont_details, "email")
            _extract(contact, "pPhone", cont_details, "phone")
            phone_el = contact.find("pPhone")
            has_phone = phone_el is not None and phone_el.text is not None

            # add the top level repo data about address and phone
            _extract(repo, "postalAddress", cont_details, "address", unescape=True)
            if not has_phone:
                _extract(repo, "paPhone", cont_details, "phone")
            _extract(repo, "paFax", cont_details, "fax")

            # we also add the top level stuff about lat/lon
            if organisation.get("lat") is not None:
                cont_details["lat"] = organisation.get("lat")
            if organisation.get("lon") is not None:
                cont_details["lon"] = organisation.get("lon")

            # record the job title as the contact role for the time being
            full_record = {"details" : cont_details}
            _extract(contact, "pJobTitle", full_record, "role", unescape=True, aslist=True)

            contacts.append(full_record)

    # now assemble the object (optional sections only when they have content)
    register["metadata"] = [
        {
            "lang" : "en",
            "default" : True,
            "record" : metadata
        }
    ]
    if software:
        register["software"] = [software]
    if contacts:
        register["contact"] = contacts
    if organisation:
        register["organisation"] = [{"details" : organisation, "role" : ["host"]}] # add a default role
    if policies:
        register["policy"] = policies
    if apis:
        register["api"] = apis

    # final few opendoar admin values
    opendoar["in_opendoar"] = True
    # NOTE(review): datetime.now() is local time, but the format appends a
    # literal "Z" - confirm whether UTC was intended.
    opendoar["last_saved"] = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

    record = {
        "register" : register,
        "admin" : {
            "opendoar" : opendoar
        }
    }

    statistics["third_party"] = "opendoar"
    statistics["type"] = "item_count"

    # apply any additional field patches; a patch is a "||"-separated list of
    # "dotted.field:value" segments, the value being JSON when it parses
    for patch in patches:
        segments = patch.split("||")
        for s in segments:
            parts = s.split(":", 1)
            field = parts[0]
            try:
                value = json.loads(parts[1])
            except ValueError:
                value = parts[1]
            stack = field.split(".")
            _apply(record, stack, 0, value)

    return record, [statistics]
def migrate_repo(repo):
    """Build a register record and a statistics entry from one repository.

    ``repo`` is read through ``get``, ``find``, iteration and ``.text``, so
    it is expected to behave like an ElementTree element. Most values are
    copied by the ``_extract`` helper. Policy terms and subject
    classifications are stored as they appear in the source.

    The ``country``, ``policies`` and ``contacts`` children are optional: a
    missing one is skipped, in the same way as ``classes``, ``languages`` and
    ``contentTypes``.

    Returns:
        A tuple ``(record, [statistics])``. ``record`` has the keys
        ``"register"`` and ``"admin"``; ``statistics`` is a dict describing
        the item count.
    """
    # the various components we need to assemble
    opendoar = {}
    metadata = {}
    organisation = {}
    contacts = []
    apis = []
    statistics = {}
    register = {}
    software = {}
    policies = []

    # original opendoar id
    odid = repo.get("rID")
    if odid is not None:
        opendoar["rid"] = odid

    # repository name
    _extract(repo, "rName", metadata, "name", unescape=True)

    # repository acronym
    _extract(repo, "rAcronym", metadata, "acronym", unescape=True)

    # repository url
    _extract(repo, "rUrl", metadata, "url")

    # oai base url
    oai = {"api_type" : "oai-pmh"}
    _extract(repo, "rOaiBaseUrl", oai, "base_url")
    if "base_url" in oai:
        apis.append(oai)

    # organisational details
    _extract(repo, "uName", organisation, "unit", unescape=True)
    _extract(repo, "uAcronym", organisation, "unit_acronym", unescape=True)
    _extract(repo, "uUrl", organisation, "unit_url")
    _extract(repo, "oName", organisation, "name", unescape=True)
    _extract(repo, "oAcronym", organisation, "acronym", unescape=True)
    _extract(repo, "oUrl", organisation, "url")
    _extract(repo, "paLatitude", organisation, "lat", cast=float)
    _extract(repo, "paLongitude", organisation, "lon", cast=float)

    # country details (a repository without a country element is tolerated)
    cel = repo.find("country")
    isocode = None
    if cel is not None:
        _extract(cel, "cIsoCode", metadata, "country_code", lower=True)
        _extract(cel, "cIsoCode", organisation, "country_code", lower=True)
        isocode = cel.find("cIsoCode")
    if isocode is not None:
        code = isocode.text
        if code is not None and code != "":
            try:
                # specify the continent in the metadata
                continent_code = transformations.cca_to_ctca2(code)
                metadata["continent_code"] = continent_code.lower()
                continent = transformations.cca_to_ctn(code)
                metadata["continent"] = continent

                # normalised country name
                country = pycountry.countries.get(alpha2=code.upper()).name
                metadata["country"] = country
                organisation["country"] = country
            except KeyError:
                # unknown code: leave whatever was set before the failure
                pass

    # repository description
    _extract(repo, "rDescription", metadata, "description", unescape=True)

    # remarks
    _extract(repo, "rRemarks", metadata, "description", unescape=True, append=True, prepend=" ")

    # statistics
    _extract(repo, "rNumOfItems", statistics, "value", cast=int)
    _extract(repo, "rDateHarvested", statistics, "date")

    # established date
    _extract(repo, "rYearEstablished", metadata, "established_date")

    # repository type
    _extract(repo, "repositoryType", metadata, "repository_type", aslist=True)

    # operational status
    _extract(repo, "operationalStatus", register, "operational_status")

    # software
    _extract(repo, "rSoftWareName", software, "name", unescape=True)
    _extract(repo, "rSoftWareVersion", software, "version")

    # subject classifications
    classes = repo.find("classes")
    if classes is not None:
        metadata["subject"] = []
        for c in classes:
            subject = {}
            _extract(c, "clCode", subject, "code")
            _extract(c, "clTitle", subject, "term", unescape=True)
            metadata["subject"].append(subject)

    # languages
    langs = repo.find("languages")
    if langs is not None:
        metadata["language_code"] = []
        metadata["language"] = []
        for l in langs:
            code = l.find("lIsoCode")
            # skip missing elements as well as empty or absent text
            if code is not None and code.text:
                lc = code.text.lower()
                lang = pycountry.languages.get(alpha2=lc).name
                metadata["language_code"].append(lc)
                metadata["language"].append(lang)

    # content types
    ctel = repo.find("contentTypes")
    if ctel is not None:
        metadata["content_type"] = []
        for ct in ctel:
            metadata["content_type"].append(ct.text)

    # policies
    polel = repo.find("policies")
    if polel is not None:
        for p in polel:
            policy = {}
            _extract(p, "policyType", policy, "policy_type")
            _extract(p, "policyGrade", policy, "policy_grade")
            posel = p.find("poStandard")
            if posel is not None:
                policy["terms"] = []
                for item in posel:
                    policy["terms"].append(item.text)
            policies.append(policy)

    # contacts
    conel = repo.find("contacts")
    if conel is not None:
        for contact in conel:
            cont_details = {}
            _extract(contact, "pName", cont_details, "name", unescape=True)
            _extract(contact, "pJobTitle", cont_details, "job_title", unescape=True)
            _extract(contact, "pEmail", cont_details, "email")
            _extract(contact, "pPhone", cont_details, "phone")
            phone_el = contact.find("pPhone")
            has_phone = phone_el is not None and phone_el.text is not None

            # add the top level repo data about address and phone
            _extract(repo, "postalAddress", cont_details, "address", unescape=True)
            if not has_phone:
                _extract(repo, "paPhone", cont_details, "phone")
            _extract(repo, "paFax", cont_details, "fax")

            # we also add the top level stuff about lat/lon
            if organisation.get("lat") is not None:
                cont_details["lat"] = organisation.get("lat")
            if organisation.get("lon") is not None:
                cont_details["lon"] = organisation.get("lon")

            # record the job title as the contact role for the time being
            full_record = {"details" : cont_details}
            _extract(contact, "pJobTitle", full_record, "role", unescape=True, aslist=True)

            contacts.append(full_record)

    # now assemble the object (every section is always present here)
    register["metadata"] = [
        {
            "lang" : "en",
            "default" : True,
            "record" : metadata
        }
    ]
    register["software"] = [software]
    register["contact"] = contacts
    register["organisation"] = [{"details" : organisation, "role" : ["host"]}] # add a default role
    register["policy"] = policies
    register["api"] = apis

    opendoar["in_opendoar"] = True

    record = {
        "register" : register,
        "admin" : {
            "opendoar" : opendoar
        }
    }

    statistics["third_party"] = "opendoar"
    statistics["type"] = "item_count"

    return record, [statistics]
def detect(self, register, info):
    """Derive the continent of ``register`` from its country code.

    The continent code and name are looked up through ``transformations``,
    stored with ``register.set_continent`` and the result is logged.
    ``info`` is not used.
    """
    country = register.country_code
    ctca2 = transformations.cca_to_ctca2(country)
    name = transformations.cca_to_ctn(country)
    register.set_continent(name=name, code=ctca2)

    message = "Determined continent from country: " + country + " -> " + name
    log.info(message)