Example #1
def store_as_csv(testing_article_indexes, testing_article_categories, output_file_name=OUTPUT_CSV_FILE):
    start_time = datetime.utcnow()
    log("Storing prediction results as csv in: {}".format(output_file_name))
    # Binary mode is required by the csv module on Python 2.
    with open(output_file_name, 'wb') as csv_file:
        writer = csv.writer(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['id', 'specialCoverage'])
        for index, category in zip(testing_article_indexes, testing_article_categories):
            writer.writerow([index, category])
    log("Done storing prediction csv in {}s.".format(seconds_since(start_time)))
Example #2
    def dump(self, export_file_name):
        start_time = datetime.utcnow()
        state = {
            "index_to_family": self.index_to_family,
            "word_to_family_index": self.word_to_family_index,
            # Sets are not JSON-serializable, so store as a list.
            "ignored_words": list(self.ignored_words)
        }
        try:
            with open(export_file_name, mode='w') as output_file:
                json.dump(state, output_file, cls=ComplexObjectSerializer)
            log("Done export of {} families and {} ignored words in {}s.".format(len(self.index_to_family),
                                                                                 len(self.ignored_words),
                                                                                 seconds_since(start_time)))
        except Exception as e:
            log("Could not export data to JSON file: {}. Reason: {}".format(export_file_name, e))
Example #3
    def load(self, export_file_name):
        start_time = datetime.utcnow()
        try:
            with open(export_file_name, mode='r') as input_file:
                state = json.load(input_file)
            self.ignored_words = set(state.get("ignored_words") or [])
            self.word_to_family_index = state.get("word_to_family_index") or {}
            index_to_family_dict = state.get("index_to_family") or {}
            # JSON object keys are always strings, so convert them back to ints.
            for index, family_dict in index_to_family_dict.iteritems():
                self.index_to_family[int(index)] = WordFamily(part_of_speech=family_dict.get(u"part_of_speech"),
                                                              synonyms=set(family_dict.get(u"synonyms") or []))

            log("Done import of {} families and {} ignored words in {}s.".format(len(self.index_to_family),
                                                                                 len(self.ignored_words),
                                                                                 seconds_since(start_time)))
        except Exception as e:
            log("Could not import data from JSON file: {}. Reason: {}".format(export_file_name, e))
Example #4
def find_and_store_similar_families(word_cache,
                                    filename=SIMILARITIES_FILE_NAME):
    SIMILARITY_THRESHOLD = 0.3

    changes = []
    start_time = datetime.utcnow()
    # Compare every unordered pair of families exactly once.
    for outer_index, outer_family in word_cache.index_to_family.iteritems():
        for inner_index, inner_family in word_cache.index_to_family.iteritems():
            if outer_index < inner_index:
                ratio = similarity(outer_family, inner_family)
                if ratio > SIMILARITY_THRESHOLD:
                    log("similarity: {} between (#{}) {} and (#{}) {}".format(
                        round(ratio, 2), outer_index, outer_family.synonyms,
                        inner_index, inner_family.synonyms))
                    changes.append((outer_index, inner_index))
    with open(filename, mode='w') as output_file:
        json.dump(changes, output_file)
    log("Found {} similarities in {}s!".format(len(changes),
                                               seconds_since(start_time)))
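
similarity() is not defined in these examples. One plausible reading, assuming it computes the Jaccard index of the two families' synonym sets:

def similarity(family_a, family_b):
    # Assumption: Jaccard index of the two synonym sets, in [0.0, 1.0].
    union = family_a.synonyms | family_b.synonyms
    if not union:
        return 0.0
    return len(family_a.synonyms & family_b.synonyms) / float(len(union))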
Example #5
def load_articles(file_name):
    """
    :type file_name: str
    :rtype: list[model.article.Article | model.article.TrainingArticle]
    """
    start_time = datetime.utcnow()
    articles = []

    with open(file_name) as training_file:
        training_file_dicts = json.load(training_file,
                                        encoding=SOURCE_ENCODING)

    for training_file_dict in training_file_dicts:
        articles.append(article_to_model(training_file_dict))

    # Drop dicts that could not be translated into models.
    articles = filter_nones(articles)
    log("Loaded {} dicts, translated to {} models ({}%) within {} seconds".format(
        len(training_file_dicts), len(articles),
        percentage(len(articles), len(training_file_dicts)),
        seconds_since(start_time)))
    return articles
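
filter_nones() and percentage() are small helpers not shown in these examples. Plausible sketches, assuming article_to_model() returns None for dicts it cannot translate:

def filter_nones(items):
    # Assumption: drops the None entries produced by failed translations.
    return [item for item in items if item is not None]

def percentage(part, total):
    # Assumption: rounded percentage, guarding against a zero denominator.
    return round(100.0 * part / total, 2) if total else 0.0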
Example #6
def load_features(features_file_name=FEATURES_DUMP_FILE_NAME):
    """
    :type features_file_name: str
    :rtype: list[model.ArticleFeatures.ArticleFeatures]
    """
    start_time = datetime.utcnow()
    try:
        with open(features_file_name, mode='r') as input_file:
            features_dump = json.load(input_file)
        output_features = []
        for feature in features_dump:
            article_id = int(feature.get("article_id"))
            word_family_index_to_occurences = {}
            # JSON object keys are strings, so convert them back to ints.
            for family_index, occurences in feature.get("word_family_index_to_occurences").iteritems():
                if family_index is not None and occurences is not None:
                    word_family_index_to_occurences[int(family_index)] = int(occurences)
            output_features.append(ArticleFeatures(article_id=article_id,
                                                   word_family_index_to_occurences=word_family_index_to_occurences))
        log("Done import of {} articles' features in {}s.".format(len(output_features), seconds_since(start_time)))
        return output_features
    except Exception as e:
        log("Could not import features from JSON file: {}. Reason: {}".format(features_file_name, e))
        return []
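
For reference, the dump this function parses would look roughly like the following Python literal (the article id and counts are made up):

# Illustrative dump contents only.
features_dump = [
    {
        "article_id": 42,
        "word_family_index_to_occurences": {"3": 2,   # string keys from JSON,
                                            "17": 1}  # converted back to ints
    }
]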
Example #7
def store_features(article_features, features_file_name=FEATURES_DUMP_FILE_NAME):
    """
    :type article_features: list[model.ArticleFeatures.ArticleFeatures]
    :type features_file_name: str
    """
    start_time = datetime.utcnow()
    try:
        with open(features_file_name, mode='w') as output_file:
            json.dump(article_features, output_file, cls=ComplexObjectSerializer)
        log("Done export of {} articles' features in {}s.".format(len(article_features), seconds_since(start_time)))
    except Exception as e:
        log("Could not export article features data to JSON file: {}. Reason: {}".format(features_file_name, e))
Example #8
for index, article in enumerate(articles, start=1):
    start_time = datetime.utcnow()
    log("Started parsing article #{}/{}: {}...".format(index, len(articles),
                                                       article))
    if article.id not in parsed_features:
        article_feature = ArticleFeatures(article.id)
        linked_text_to_translate = strip_tags(article.headline + " " +
                                              article.text)
        linked_text_to_translate = filter_words_shorter_than(
            linked_text_to_translate)
        all_words = process_article_to_words(
            " ".join(linked_text_to_translate))
        log("Article #{}: {}\nall words ({}): {}...".format(
            index, article, len(all_words), crop_list_to_max(all_words)))
        for word in all_words:
            word_family_index = word_cache.add_word(word)
            if word_family_index is not None:
                article_feature.add_occurence(word_family_index)
        log("Finished parsing article #{} features: {}\narticle analyzed in: {}\n\n"
            .format(index, article_feature, seconds_since(start_time)))
        # Persist the cache and features after every article so progress
        # survives an interruption of this long-running loop.
        word_cache.dump(CACHE_DUMP_FILE)
        parsed_features.append(article_feature.article_id)
        article_features.append(article_feature)
        store_features(article_features=article_features,
                       features_file_name=TESTING_SET_FEATURES_DUMP_FILE_NAME)
    else:
        log("Skipping article #{}: {} - already parsed!".format(
            index, article))

log("Done!")