Пример #1
0
def createFromScrapedDefinitions():
    common_logger.info("Creating AcronymDB")
    csv.field_size_limit(sys.maxint)

    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # open as csv file with headers
        acronym_csv = csv.DictReader(
            open(definition_file, "rb"), delimiter=",")

        for row in acronym_csv:
            acronym = toUnicode(row["acronym"])
            acronym_expansion = toUnicode(row["acronym_expansion"])
            article_id = toUnicode(row["article_id"])
            if(acronym not in acronymDB):
                acronymDB[acronym] = []
            acronymDB[acronym].append([acronym_expansion
                                       .strip().lower().replace('-', ' '), article_id])
            # , row["article_title"]]) # title was part of old format
            loaded_acronyms += 1
            if(loaded_acronyms % 10000 == 0):
                common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    defs_per_acronym = [0] * 1000
    insts_per_def = [0] * 1000
    #num_acronyms = len(acronymDB)
    for acronym, values_for_this_acronym in acronymDB.items():
        values_for_this_acronym = sorted(
            values_for_this_acronym, key=lambda x: x[0])

        def_count = 0
        inst_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        #, article_title]\ # title was part of old format in the line below
        for index, [acronym_expansion, article_id]\
                in enumerate(values_for_this_acronym):
            if AcronymExpansion.startsSameWay(acronym_expansion, expansion_of_last_acronym):
                inst_count += 1
                values_for_this_acronym[index].append(def_count)
                values_for_this_acronym[index][0] = expansion_of_last_acronym
            else:
                insts_per_def[min(inst_count, len(insts_per_def) - 1)] += 1
                inst_count = 0
                def_count += 1
                expansion_of_last_acronym = acronym_expansion
                values_for_this_acronym[index].append(def_count)
        defs_per_acronym[min(def_count, len(defs_per_acronym) - 1)] += 1
        acronymDB[acronym] = numpy.array(values_for_this_acronym)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")