def download_crawler_data(date):
    """
    Download, extract and clean up the raw crawler data for one day.

    @param date Formatted date AAAA_MM_JJ, such as 2015_01_03 (str).
    """
    full_url = CRAWLER_URL + date + ".tar.gz"
    print("downloading : " + full_url)
    r = make_http_request(full_url)
    if not os.path.isdir(DEFAULT_DATA_DIRECTORY):
        os.makedirs(DEFAULT_DATA_DIRECTORY)
    # Build the archive path once; os.path.join works whether or not
    # DEFAULT_DATA_DIRECTORY ends with a path separator (plain string
    # concatenation silently produced a wrong path when it did not).
    archive_path = os.path.join(DEFAULT_DATA_DIRECTORY, date + ".tar.gz")
    with open(archive_path, 'wb') as f:
        f.write(r.content)
    # Untar and extract in the same directory. The context manager
    # guarantees the tarfile is closed even if extraction raises.
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(DEFAULT_DATA_DIRECTORY)
    # Move the extracted directory up to the data directory.
    try:
        shutil.move(
            os.path.join(DEFAULT_DATA_DIRECTORY, DEFAULT_LOCATION + date),
            DEFAULT_DATA_DIRECTORY)
    except shutil.Error:
        print("Error, maybe the directory " + date + " already exists")
    # Remove the downloaded tarball now that its content is extracted.
    os.remove(archive_path)
    print("Success with : " + date)
def download_crawler_data(date):
    """
    Download raw data for a day, extract it and delete the tarball.

    @param date Date formatted AAAA_MM_JJ, such as 2015_01_03 (str).
    """
    full_url = CRAWLER_URL + date + ".tar.gz"
    print("downloading : " + full_url)
    r = make_http_request(full_url)
    if not os.path.isdir(DEFAULT_DATA_DIRECTORY):
        os.makedirs(DEFAULT_DATA_DIRECTORY)
    # os.path.join is used instead of string concatenation so the path
    # is correct whether or not DEFAULT_DATA_DIRECTORY has a trailing
    # separator; the path is also computed only once.
    archive_path = os.path.join(DEFAULT_DATA_DIRECTORY, date + ".tar.gz")
    with open(archive_path, 'wb') as f:
        f.write(r.content)
    # Untar and extract in the same directory; 'with' ensures the
    # archive handle is closed even when extraction fails.
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(DEFAULT_DATA_DIRECTORY)
    # Move the extracted directory to the top of the data directory.
    try:
        shutil.move(
            os.path.join(DEFAULT_DATA_DIRECTORY, DEFAULT_LOCATION + date),
            DEFAULT_DATA_DIRECTORY)
    except shutil.Error:
        print("Error, maybe the directory " + date + " already exists")
    # Clean up the tarball once its contents are on disk.
    os.remove(archive_path)
    print("Success with : " + date)
def search_source(source_uri='/s/wordnet/3.0'):
    '''
    Return raw JSON data with the 50 statements submitted by a source.

    :param source_uri: a uri specifying the source, e.g.
        '/s/contributor/omcs/rspeer', '/s/wordnet/3.0',
        '/s/rule/sum_edges' etc.
    '''
    lookup_url = '%s%s' % (settings.BASE_LOOKUP_URL, source_uri)
    return make_http_request(lookup_url)
def get_similarity(concept1='dog', concept2='dog'):
    """
    Return a similarity score between two concepts.
    """
    target_uri = '/c/' + settings.LANGUAGE + "/" + concept2
    encoded = urllib.parse.urlencode({"filter": target_uri})
    base = '%s/c/%s/%s?' % (settings.BASE_ASSOCIATION_URL,
                            settings.LANGUAGE, concept1)
    json_data = make_http_request(base + encoded)
    scores = parse_similar_concepts(json_data)
    # An empty parse result means no association was found: score 0.
    return scores[0][1] if scores else 0
def get_similar_concepts(concept='dog', filter='/c/en/', limit=10, **kwargs):
    """
    Return a list of similar concepts with a score.
    """
    # NOTE: 'filter' shadows the builtin, but it is part of the public
    # signature, so it is kept as-is.
    query_args = {"filter": filter, "limit": limit}
    for key in kwargs:
        if key not in settings.SUPPORTED_ASSOCIATION_ARGS:
            raise Exception("Association argument '" + key + "' incorrect.")
        query_args[key] = kwargs[key]
    base = '%s/c/%s/%s?' % (settings.BASE_ASSOCIATION_URL,
                            settings.LANGUAGE, concept)
    url = base + urllib.parse.urlencode(query_args)
    return parse_similar_concepts(make_http_request(url))
def search_edges(filter='/c/en/', limit=10, **kwargs):
    """
    :rtype: Edge list
    """
    query_args = {"filter": filter, "limit": limit}
    for key in kwargs:
        if key not in settings.SUPPORTED_SEARCH_ARGS:
            raise Exception("Search argument '" + key + "' incorrect.")
        query_args[key] = kwargs[key]
    url = '%s%s' % (settings.BASE_SEARCH_URL, '?')
    url += urllib.parse.urlencode(query_args)
    return parse_relevant_edges(make_http_request(url))
def search_concept(concept, limit=1, **kwargs):
    '''
    Returns a list of edges

    :param concept: a concept word or phrase, e.g. 'toast',
        'see movie' etc.
    '''
    query_args = {"limit": limit}
    for key, value in kwargs.items():
        if not is_arg_valid(key, settings.SUPPORTED_LOOKUP_ARGS):
            raise Exception("LookUp argument '" + key + "' incorrect.")
        query_args[key] = value
    # Multi-word concepts are expressed with underscores in the URI.
    normalized = concept.replace(' ', '_')
    url = ('%s/c/%s/%s?' % (settings.BASE_LOOKUP_URL,
                            settings.LANGUAGE, normalized)
           + urllib.parse.urlencode(query_args))
    return parse_relevant_edges(make_http_request(url).json())
def get_similarity(concept1='dog', concept2='dog'):
    """
    Performs an association query and gets a similarity score between
    two concepts.

    @param concept1 First concept.
    @param concept2 Second concept.
    @return A similarity score (float).
    """
    filter_uri = '/c/' + settings.LANGUAGE + "/" + concept2
    encoded_args = urllib.parse.urlencode({"filter": filter_uri})
    url = '%s/c/%s/%s?' % (settings.BASE_ASSOCIATION_URL,
                           settings.LANGUAGE, concept1) + encoded_args
    results = parse_similar_concepts(make_http_request(url))
    # No parsed results means no association was found: score 0.
    return results[0][1] if results else 0
def search_edges(filter='/c/en/', limit=10, **kwargs):
    """
    Performs a search query and parses the result.

    @see settings.SUPPORTED_SEARCH_ARGS
    @param filter Filter.
    @param limit Maximum number of results.
    @param kwargs Other supported search arguments.
    @return A list of result.Edge objects.
    """
    query_args = {"filter": filter, "limit": limit}
    for key in kwargs:
        if key not in settings.SUPPORTED_SEARCH_ARGS:
            raise Exception("Search argument '" + key + "' incorrect.")
        query_args[key] = kwargs[key]
    search_url = ('%s%s' % (settings.BASE_SEARCH_URL, '?')
                  + urllib.parse.urlencode(query_args))
    return parse_relevant_edges(make_http_request(search_url))
def get_similar_concepts_by_term_list(term_list, filter='/c/en/', limit=10,
                                      **kwargs):
    """
    Returns concepts similar to the list.

    Example :
    http://conceptnet5.media.mit.edu/data/5.3/assoc/list/en/wayne_rooney,sport
    """
    query_args = {"filter": filter, "limit": limit}
    for key in kwargs:
        if key not in settings.SUPPORTED_ASSOCIATION_ARGS:
            raise Exception("Association argument '" + key + "' incorrect.")
        query_args[key] = kwargs[key]
    # The association endpoint takes the terms as a comma-separated list.
    joined_terms = ','.join(term_list)
    url = ('%s/list/%s/%s?' % (settings.BASE_ASSOCIATION_URL,
                               settings.LANGUAGE, joined_terms)
           + urllib.parse.urlencode(query_args))
    return parse_similar_concepts(make_http_request(url))
def get_similar_concepts(concept='dog', filter='/c/en/', limit=10, **kwargs):
    """
    Performs an association query and parses the result.

    @see settings.SUPPORTED_ASSOCIATION_ARGS
    @param concept Word or phrase.
    @param filter Filter.
    @param limit Maximum number of results.
    @param kwargs Other supported association arguments.
    @return A list of [concept,similarity].
    """
    query_args = {"filter": filter, "limit": limit}
    for key in kwargs:
        if key not in settings.SUPPORTED_ASSOCIATION_ARGS:
            raise Exception("Association argument '" + key + "' incorrect.")
        query_args[key] = kwargs[key]
    assoc_url = ('%s/c/%s/%s?' % (settings.BASE_ASSOCIATION_URL,
                                  settings.LANGUAGE, concept)
                 + urllib.parse.urlencode(query_args))
    return parse_similar_concepts(make_http_request(assoc_url))
def search_concept(concept, limit=1, **kwargs):
    '''
    Performs a lookup query and parses the result into edges objects.

    @see result.py
    @see settings.SUPPORTED_LOOKUP_ARGS
    @param concept A concept, word or phrase, e.g. 'toast', 'see movie' etc.
    @param limit The number of results needed.
    @param kwargs Other supported lookup arguments.
    @return A list of result.Edge objects.
    '''
    query_args = {"limit": limit}
    for key in kwargs:
        if key not in settings.SUPPORTED_LOOKUP_ARGS:
            raise Exception("LookUp argument '" + key + "' incorrect.")
        query_args[key] = kwargs[key]
    # Multi-word concepts are written with underscores in the URI.
    normalized = concept.replace(' ', '_')
    lookup_url = ('%s/c/%s/%s?' % (settings.BASE_LOOKUP_URL,
                                   settings.LANGUAGE, normalized)
                  + urllib.parse.urlencode(query_args))
    return parse_relevant_edges(make_http_request(lookup_url).json())