Example #1
def __init__(self, geodatabase_path, ianadatabase_path, wfbdatabase_path,
             model_data_path, languages):
    self.wikipedia_api_url_extraction = WikipediaApiUrlExtraction()
    self.feature_collection = FeatureCollection(geodatabase_path,
                                                ianadatabase_path,
                                                wfbdatabase_path)
    self.instance_classification = InstanceClassification(
        model_data_path, languages)
    self.languages = languages
Example #2
def __init__(self, geodatabase_path, ianadatabase_path, wfbdatabase_path, model_data_path, languages):
    self.wikipedia_api_url_extraction = WikipediaApiUrlExtraction()
    self.feature_collection = FeatureCollection(geodatabase_path, ianadatabase_path, wfbdatabase_path)
    self.instance_classification = InstanceClassification(model_data_path, languages)
    self.languages = languages
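
Examples #1 and #2 show the same constructor with different formatting: it wires up a Wikipedia URL extractor, a feature collector backed by three local databases, and a per-language instance classifier. A minimal instantiation sketch for the enclosing ArticleExtraction class (shown in full below); every path and the language list here are hypothetical placeholders:

# Sketch only: all paths and language codes below are hypothetical.
extraction = ArticleExtraction("data/geo.db",            # geodatabase_path
                               "data/iana.db",           # ianadatabase_path
                               "data/worldfactbook.db",  # wfbdatabase_path
                               "models/",                # model_data_path
                               ["en", "de"])             # languages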
Example #3
class ArticleExtraction(object):

    def __init__(self, geodatabase_path, ianadatabase_path, wfbdatabase_path,
                 model_data_path, languages):
        self.wikipedia_api_url_extraction = WikipediaApiUrlExtraction()
        self.feature_collection = FeatureCollection(geodatabase_path,
                                                    ianadatabase_path,
                                                    wfbdatabase_path)
        self.instance_classification = InstanceClassification(
            model_data_path, languages)
        self.languages = languages
    
    def parse_url(self, link):
        return self.wikipedia_api_url_extraction.parse_url(link)
    
    def collect_features(self, wikipedia_url):
        extracted_urls = self.wikipedia_api_url_extraction.extract_urls(wikipedia_url)
        collected_features = {}
        extracted_urls_len = len(extracted_urls)
        extracted_urls_count = 0

        id_count = 0
        for extracted_url in extracted_urls:

            extracted_urls_count += 1
            print("extract URL " + str(extracted_urls_count) + " of " + str(extracted_urls_len))

            current_features = self.feature_collection.get_features("", extracted_url)
            current_features["url"] = extracted_url
            collected_features[id_count] = current_features
            id_count += 1
        return collected_features

    def add_predictions(self, language, collected_features):
        
        ids_to_be_removed = []
        for url_id in collected_features:
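            # Build the observation in a fixed order: [ip-location,
            # tld-location, website-language]; a missing feature is
            # encoded as the string "NaN".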
            observation = []
            
            if "ip-location" in collected_features[url_id]:
                observation.append(collected_features[url_id]["ip-location"])
            else:
                observation.append("NaN")

            if "tld-location" in collected_features[url_id]:
                observation.append(collected_features[url_id]["tld-location"])
            else:
                observation.append("NaN")

            if "website-language" in collected_features[url_id]:
                observation.append(collected_features[url_id]["website-language"].upper())
            else:
                observation.append("NaN")

            if language in self.languages:
                classification = self.instance_classification.classify(language, observation)
                if classification is None:
                    if url_id not in ids_to_be_removed:
                        ids_to_be_removed.append(url_id) 
                else:
                    collected_features[url_id]["classification"] = classification
                    
            classification_general = self.instance_classification.classify("general", observation)
            if classification_general is None:
                if url_id not in ids_to_be_removed:
                    ids_to_be_removed.append(url_id) 
            else:
                collected_features[url_id]["classification-general"] = classification_general

            collected_features[url_id]["wikipedia-language"] = language

        for id_to_be_removed in ids_to_be_removed:
            del collected_features[id_to_be_removed]

        return collected_features
    
    def fix_outliers(self, url_feature_dict, classification_id, fixed_classification_id, features):
        for feature_id in url_feature_dict:
            if classification_id in url_feature_dict[feature_id]:
                classification = url_feature_dict[feature_id][classification_id]
            else:
                continue

            feature_values = {}
            for feature in features:
                if feature in url_feature_dict[feature_id]:
                    feature_values[feature] = url_feature_dict[feature_id][feature]
                
            classification_in_features = False
            for feature_name in feature_values:
                if classification.lower() == feature_values[feature_name].lower():
                    classification_in_features = True

            if not classification_in_features:
                # take first element in feature list and use it
                if features[0] in feature_values:
                    url_feature_dict[feature_id][fixed_classification_id] = feature_values[features[0]]

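            # Fall back to the classifier's own output when nothing was
            # substituted above.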
            if fixed_classification_id not in url_feature_dict[feature_id]:
                url_feature_dict[feature_id][fixed_classification_id] = classification

        return url_feature_dict
                
    def get_as_array(self, url_feature_dict):
        url_feature_array = []
        
        for feature_id in url_feature_dict:
            dict_for_feature_id = {}

            for feature_name in url_feature_dict[feature_id]:
                dict_for_feature_id[feature_name] = url_feature_dict[feature_id][feature_name]
            
            url_feature_array.append(dict_for_feature_id)
        
        return url_feature_array
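
Taken together, the methods form a small pipeline: extract the external links of a Wikipedia article, collect per-URL features, classify each URL, repair classifications that contradict the collected features, and flatten the result. A hedged end-to-end sketch, reusing the extraction instance from the earlier snippet; the article link, the language code "en", and the column names passed to fix_outliers are assumptions for illustration:

# Hypothetical article link; parse_url is assumed to normalize it into
# the form collect_features expects.
wikipedia_url = extraction.parse_url("https://en.wikipedia.org/wiki/Example")
features = extraction.collect_features(wikipedia_url)
features = extraction.add_predictions("en", features)

# Overwrite classifications that match none of the location features,
# preferring the first listed feature's value (here ip-location).
features = extraction.fix_outliers(features, "classification-general",
                                   "classification-general-fixed",
                                   ["ip-location", "tld-location"])

for row in extraction.get_as_array(features):
    print(row["url"], row.get("classification-general-fixed"))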
Example #4
class ArticleExtraction(object):
    def __init__(self, geodatabase_path, ianadatabase_path, wfbdatabase_path,
                 model_data_path, languages):
        self.wikipedia_api_url_extraction = WikipediaApiUrlExtraction()
        self.feature_collection = FeatureCollection(geodatabase_path,
                                                    ianadatabase_path,
                                                    wfbdatabase_path)
        self.instance_classification = InstanceClassification(
            model_data_path, languages)
        self.languages = languages

    def parse_url(self, link):
        return self.wikipedia_api_url_extraction.parse_url(link)

    def collect_features(self, wikipedia_url):
        extracted_urls = self.wikipedia_api_url_extraction.extract_urls(
            wikipedia_url)
        collected_features = {}
        extracted_urls_len = len(extracted_urls)
        extracted_urls_count = 0

        id_count = 0
        for extracted_url in extracted_urls:

            extracted_urls_count += 1
            print("extract URL " + str(extracted_urls_count) + " of " +
                  str(extracted_urls_len))

            current_features = self.feature_collection.get_features(
                "", extracted_url)
            current_features["url"] = extracted_url
            collected_features[id_count] = current_features
            id_count += 1
        return collected_features

    def add_predictions(self, language, collected_features):

        ids_to_be_removed = []
        for url_id in collected_features:
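            # Build the observation in a fixed order: [ip-location,
            # tld-location, website-language]; a missing feature is
            # encoded as the string "NaN".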
            observation = []

            if "ip-location" in collected_features[url_id]:
                observation.append(collected_features[url_id]["ip-location"])
            else:
                observation.append("NaN")

            if "tld-location" in collected_features[url_id]:
                observation.append(collected_features[url_id]["tld-location"])
            else:
                observation.append("NaN")

            if "website-language" in collected_features[url_id]:
                observation.append(
                    collected_features[url_id]["website-language"].upper())
            else:
                observation.append("NaN")

            if language in self.languages:
                classification = self.instance_classification.classify(
                    language, observation)
                if classification is None:
                    if url_id not in ids_to_be_removed:
                        ids_to_be_removed.append(url_id)
                else:
                    collected_features[url_id][
                        "classification"] = classification

            classification_general = self.instance_classification.classify(
                "general", observation)
            if classification_general is None:
                if url_id not in ids_to_be_removed:
                    ids_to_be_removed.append(url_id)
            else:
                collected_features[url_id][
                    "classification-general"] = classification_general

            collected_features[url_id]["wikipedia-language"] = language

        for id_to_be_removed in ids_to_be_removed:
            del collected_features[id_to_be_removed]

        return collected_features

    def fix_outliers(self, url_feature_dict, classification_id,
                     fixed_classification_id, features):
        for feature_id in url_feature_dict:
            if classification_id in url_feature_dict[feature_id]:
                classification = url_feature_dict[feature_id][
                    classification_id]
            else:
                continue

            feature_values = {}
            for feature in features:
                if feature in url_feature_dict[feature_id]:
                    feature_values[feature] = url_feature_dict[feature_id][
                        feature]

            classification_in_features = False
            for feature_name in feature_values:
                if (classification.lower()
                        == feature_values[feature_name].lower()):
                    classification_in_features = True

            if not classification_in_features:
                # take first element in feature list and use it
                if features[0] in feature_values:
                    url_feature_dict[feature_id][
                        fixed_classification_id] = feature_values[features[0]]

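            # Fall back to the classifier's own output when nothing was
            # substituted above.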
            if fixed_classification_id not in url_feature_dict[feature_id]:
                url_feature_dict[feature_id][
                    fixed_classification_id] = classification

        return url_feature_dict

    def get_as_array(self, url_feature_dict):
        url_feature_array = []

        for feature_id in url_feature_dict:
            dict_for_feature_id = {}

            for feature_name in url_feature_dict[feature_id]:
                dict_for_feature_id[feature_name] = url_feature_dict[
                    feature_id][feature_name]

            url_feature_array.append(dict_for_feature_id)

        return url_feature_array