                else:
                    url_location_dictionary[url] = tld.replace(".", "").upper()
            except KeyError:
                print "no entry found for: " + str(tld)
        return url_location_dictionary


if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts locations (country ISO code) from the top-level domains of URLs given a JSON file containing Wikipedia articles and URLs referenced by them')
    parser.add_argument('input', help='a file path to the input JSON file')
    parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True)
    parser.add_argument("--world_fact_book_database", dest="world_fact_book_database", metavar='path to world fact book database', type=str, required=True)
    parser.add_argument("--IANA_database", dest="iana_database", metavar='path to IANA database', type=str, required=True)
    args = parser.parse_args()

    inputfile_path = args.input
    outputfile_path = args.output
    wfbdatabase_path = args.world_fact_book_database
    ianadatabase_path = args.iana_database

    print "running tld_location_extraction"

    # load json input
    with open(inputfile_path) as json_input:
        json_data = json.load(json_input)

    tld_location_extraction = TLDLocationExtraction(ianadatabase_path, wfbdatabase_path)
    url_location_dictionary = tld_location_extraction.get_tld_locations(json_data)
    json_writer.write_json_file(url_location_dictionary, outputfile_path)
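# Illustrative usage of the script above (the script filename and data file names are
# assumptions, not part of the original sources):
#
#   python tld_location_extraction.py references.json \
#       --output tld_locations.json \
#       --world_fact_book_database world_fact_book.json \
#       --IANA_database iana_tlds.json
#
# Based on the assignment in the loop above, the resulting JSON maps each referenced URL
# to the country ISO code derived from its top-level domain, e.g.
# {"http://example.de/page": "DE"}; TLDs missing from the databases are reported via the
# KeyError branch.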
                continue
            except Exception as exception:
                print "Continue after " + exception.__class__.__name__ + " for URL: " + url
                continue
        return url_location_dictionary


if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts locations (country ISO code) from the IP address of URLs given a JSON file containing Wikipedia articles and URLs referenced by them')
    parser.add_argument('input', help='a file path to the input JSON file')
    parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True)
    parser.add_argument("--database", dest="database", metavar='path to mmdb country database', type=str, required=True)
    args = parser.parse_args()

    inputfile_path = args.input
    outputfile_path = args.output
    database_path = args.database

    print "running ip_location_extraction"

    # load json input
    with open(inputfile_path) as json_input:
        json_data = json.load(json_input)

    ip_location_extraction = IPLocationExtraction(database_path)
    url_location_dictionary = ip_location_extraction.get_ip_locations(json_data)
    json_writer.write_json_file(url_location_dictionary, outputfile_path)
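# Illustrative usage of the script above (filename and database name are assumptions):
#
#   python ip_location_extraction.py references.json \
#       --output ip_locations.json \
#       --database GeoLite2-Country.mmdb
#
# The output JSON again maps each referenced URL to a country ISO code, this time
# resolved from the IP address of the URL's host via the mmdb country database.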
            article_count += 1
            # print article_count


if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts geo locations from a list of wikipedia articles given in JSON.')
    parser.add_argument('input', help='a file path to a JSON file containing wikipedia article names')
    parser.add_argument("--output", dest="output", metavar='output path', type=str)
    parser.add_argument("--language", dest="language", metavar='two-letter country code', type=str, help="one of the language editions of dbpedia (default: en)", required=True)
    parser.add_argument("--threshold", dest="threshold", metavar='threshold for majority voting', type=float, help="absolute threshold for majority voting on coordinates (default: 0.1)", required=True)
    args = parser.parse_args()

    inputfile_path = args.input
    outputfile_path = args.output
    language = args.language
    threshold = args.threshold

    # load json input
    with open(inputfile_path) as json_input:
        json_data = json.load(json_input)

    print "running wikipedia_location_extraction"

    wikipedia_location_extraction = WikipediaLocationExtraction(language)
    article_url_dictionary = wikipedia_location_extraction.get_wikipedia_languages(json_data)
    json_writer.write_json_file(article_url_dictionary, outputfile_path)
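# Illustrative usage of the script above (filename and paths are assumptions):
#
#   python wikipedia_location_extraction.py articles.json \
#       --output article_locations.json \
#       --language en \
#       --threshold 0.1
#
# --language selects the DBpedia language edition to query, and --threshold is the
# absolute threshold used for the majority vote over the coordinates found per article.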
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts URLs from a given wikipedia url and calls the feature collection functions for the urls')
    parser.add_argument('url', help='a Wikipedia URL for which the features are calculated')
    parser.add_argument("--geodatabase", dest="geodatabase", help='path to mmdb country database', type=str, required=True)
    parser.add_argument("--world_fact_book_database", dest="world_fact_book_database", help='path to world fact book database', type=str, required=True)
    parser.add_argument("--IANA_database", dest="iana_database", help='path to IANA database', type=str, required=True)
    parser.add_argument("--model-data", dest="model_data", metavar='path to model-data-directory', type=str, required=True)
    parser.add_argument("--output", dest="output", help='output folder', type=str, required=True)
    args = parser.parse_args()

    geodatabase_path = args.geodatabase
    wfbdatabase_path = args.world_fact_book_database
    ianadatabase_path = args.iana_database
    model_data_path = args.model_data
    outputfile_path = args.output
    url = args.url

    languages = ["de", "en", "es", "fr", "general", "it", "nl", "sv", "uk"]

    article_extraction = ArticleExtraction(geodatabase_path, ianadatabase_path, wfbdatabase_path, model_data_path, languages)
    language, title = article_extraction.parse_url(url)
    collected_features = article_extraction.collect_features(url)
    # collected_features_with_prediction = article_extraction.add_predictions(language, collected_features)
    # json_writer.write_json_file(collected_features_with_prediction, outputfile_path + "/" + language + "-" + title + ".json")
    json_writer.write_json_file(collected_features, outputfile_path + "/" + language + "-" + title + ".json")
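# Illustrative usage of the script above (filename, example URL and data file names are
# assumptions):
#
#   python article_extraction.py https://en.wikipedia.org/wiki/Example_Article \
#       --geodatabase GeoLite2-Country.mmdb \
#       --world_fact_book_database world_fact_book.json \
#       --IANA_database iana_tlds.json \
#       --model-data model-data/ \
#       --output features/
#
# parse_url() splits the Wikipedia URL into its language edition and article title, which
# then name the per-article output file, e.g. features/en-Example_Article.json.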
            # It's safe to call clear() here because no descendants will be
            # accessed
            elem.clear()
            # Also eliminate now-empty references from the root node to elem
            for ancestor in elem.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]
        del context

        print "number of articles: " + str(article_count)
        print "number of undetected articles: " + str(article_not_detected_count)


if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts languages from a list of wikipedia articles given in the xml dump format.')
    parser.add_argument('input', help='a file path to bz2 compressed XML dump input')
    parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True)
    args = parser.parse_args()

    inputfile_path = args.input
    outputfile_path = args.output

    print "running wikipedia_language_extraction"

    wikipedia_language_extraction = WikipediaLanguageExtraction()
    wikipedia_language_dictionary = wikipedia_language_extraction.get_wikipedia_languages()
    json_writer.write_json_file(wikipedia_language_dictionary, outputfile_path)
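# Standalone sketch (not part of the original sources) of the memory-saving iterparse
# pattern used in the loop above: after an element has been processed, clear it and drop
# the references to already-processed siblings so the parsed tree does not grow with the
# size of the dump. The file handling and the tag argument are illustrative assumptions.
import bz2
from lxml import etree


def iterate_elements(dump_path, tag):
    with bz2.BZ2File(dump_path) as dump_file:
        context = etree.iterparse(dump_file, events=('end',), tag=tag)
        for event, elem in context:
            yield elem
            # safe to clear: no descendants of elem are accessed after this point
            elem.clear()
            # eliminate now-empty references from the root node to elem
            for ancestor in elem.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]
        del context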
        return url_language_dictionary

    def timeout_handler(self, signum, frame):  # Custom signal handler
        raise TimeoutException

    # Change the behavior of SIGALRM
    signal.signal(signal.SIGALRM, timeout_handler)


if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts languages from the content of URLs given in a JSON file that contains Wikipedia articles and their referenced URLs')
    parser.add_argument('input', help='a file path to the JSON input file')
    parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True)
    args = parser.parse_args()

    inputfile_path = args.input
    outputfile_path = args.output

    print "running website_language_extraction"

    # load json input
    with open(inputfile_path) as json_input:
        json_data = json.load(json_input)

    website_language_extraction = WebsiteLanguageExtraction()
    url_language_dictionary = website_language_extraction.get_website_languages(json_data)
    json_writer.write_json_file(url_language_dictionary, outputfile_path)
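# Standalone sketch (not part of the original sources) of how a SIGALRM handler like the
# one registered above is typically used: arm an alarm before a potentially slow download,
# catch the raised TimeoutException, and always disarm the alarm afterwards. The
# fetch_content() helper, the urllib2 call and the 10-second limit are assumptions.
import signal
import urllib2


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):  # Custom signal handler
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)


def fetch_content(url, timeout_seconds=10):
    signal.alarm(timeout_seconds)  # raise TimeoutException after timeout_seconds
    try:
        return urllib2.urlopen(url).read()
    except TimeoutException:
        return None
    finally:
        signal.alarm(0)  # disarm the alarm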
        # generate new article
        collected_features = article_extraction.collect_features(article_url)
        collected_features_with_prediction = article_extraction.add_predictions(language, collected_features)
        collected_features_with_fixed_outliers = article_extraction.fix_outliers(collected_features_with_prediction, "classification", "classification-fixed", features)
        collected_features_with_fixed_outliers = article_extraction.fix_outliers(collected_features_with_fixed_outliers, "classification-general", "classification-general-fixed", features)
        collected_features_array = article_extraction.get_as_array(collected_features_with_fixed_outliers)

        if len(collected_features_array) > 0:
            # generate directories if they don't exist
            if not os.path.exists(article_path):
                os.makedirs(article_path)
            if not os.path.exists(language_path):
                os.makedirs(language_path)

            json_writer.write_json_file(collected_features_array, article_analysis_path)

            count_features = ["ip-location", "tld-location", "website-language", "classification-fixed", "classification-general-fixed"]
            for count_feature in count_features:
                classification_general_counts = count_generation.generate_counts(collected_features_array, count_feature)
                classification_general_counts_array = count_generation.get_as_array(classification_general_counts, 20)
                article_count_path = os.path.join(article_path, "counts-" + count_feature + "-top-20.json")
                json_writer.write_json_file(classification_general_counts_array, article_count_path)

            # generate map data
            map_data = map_data_generation.generate_map_data_array(collected_features_array, "classification-general-fixed")
            article_map_data_path = os.path.join(article_path, "map-data.json")
            json_writer.write_json_file(map_data, article_map_data_path)

        # get execution date
print "number of articles: " + str(article_count) print "number of undetected articles: " + str( article_not_detected_count) if __name__ == '__main__': # generate help text for arguments parser = argparse.ArgumentParser( description= 'Extracts languages from a list of wikipedia articles given in the xml dump format.' ) parser.add_argument('input', help='a file path to bz2 compressed XML dump input') parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True) args = parser.parse_args() inputfile_path = args.input outputfile_path = args.output print "running wikipedia_language_extraction" wikipedia_language_extraction = WikipediaLanguageExtraction() wikipedia_language_dictionary = wikipedia_language_extraction.get_wikipedia_languages( ) json_writer.write_json_file(wikipedia_language_dictionary, outputfile_path)