def _processChoices(self, acronym_expansions):
        """
        input: list(acronym expansion strings)
        returns:
        y_labels (list): of integer labels assigned to acronym expansions
        labelToExpansion (dict): to convert label number to acronym expansion  
        """
        y_labels = []
        labelToExpansion = {}

        if(len(acronym_expansions) == 0):
            return y_labels, labelToExpansion

        y_labels = [index for index in range(len(acronym_expansions))]
        labelToExpansion[0] = acronym_expansions[0]

        for indexAhead in range(1, len(acronym_expansions)):
            new_expansion = acronym_expansions[indexAhead]
            newIsUnique = True

            # check if new_expansion is same as a previous expansion
            # if same assign previous label and move on
            for label, expansion in labelToExpansion.items():
                if(AcronymExpansion.areExpansionsSimilar(expansion, new_expansion)):
                    newIsUnique = False
                    y_labels[indexAhead] = label
                    break
            # if label is new indeed, then give it a label ID (integer) and
            # make an entry in the labelToExpansion dictionary
            if(newIsUnique):
                new_class_label = len(labelToExpansion)
                labelToExpansion[new_class_label] = new_expansion
                y_labels[indexAhead] = new_class_label

        return y_labels, labelToExpansion
# Example #2
# 0
def createFromScrapedDefinitions():
    """
    Build the acronym database from the scraped-definition CSV files and dump it.

    Every file in file_scraped_definitions_list is read as a comma-delimited
    CSV with a header row; the columns "acronym", "acronym_expansion" and
    "article_id" are used. Expansions are stripped, lower-cased and have
    hyphens replaced by spaces before being stored.

    For each acronym the entries are sorted by expansion text. Consecutive
    entries whose expansion satisfies AcronymExpansion.startsSameWay against
    the expansion that opened the current run share one definition index
    (def_count) and are rewritten to that run's expansion text. Each acronym
    ends up mapped to a numpy array built from
    [expansion, article_id, def_count] rows, and the whole dict is handed to
    dump().

    returns: None
    """
    common_logger.info("Creating AcronymDB")
    csv.field_size_limit(sys.maxint)

    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # open as csv file with headers; the context manager closes the
        # handle once the rows are consumed, even if a row fails to parse
        with open(definition_file, "rb") as definition_handle:
            acronym_csv = csv.DictReader(definition_handle, delimiter=",")

            for row in acronym_csv:
                acronym = toUnicode(row["acronym"])
                acronym_expansion = toUnicode(row["acronym_expansion"])
                article_id = toUnicode(row["article_id"])

                normalized_expansion = (
                    acronym_expansion.strip().lower().replace('-', ' '))
                # the article title was part of the old format and is no
                # longer stored
                acronymDB.setdefault(acronym, []).append(
                    [normalized_expansion, article_id])

                loaded_acronyms += 1
                if loaded_acronyms % 10000 == 0:
                    common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    # NOTE(review): these two histograms are filled in below but never read
    # inside this function; insts_per_def is only updated when the definition
    # changes, so the last run of every acronym is not counted.
    defs_per_acronym = [0] * 1000
    insts_per_def = [0] * 1000
    for acronym, values_for_this_acronym in acronymDB.items():
        # sorting by expansion text puts similar expansions next to each other
        values_for_this_acronym = sorted(
            values_for_this_acronym, key=lambda x: x[0])

        def_count = 0
        inst_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        for index, [acronym_expansion, article_id]\
                in enumerate(values_for_this_acronym):
            if AcronymExpansion.startsSameWay(acronym_expansion, expansion_of_last_acronym):
                # same definition as the current run: reuse its index and
                # canonicalize the text to the run's first expansion
                inst_count += 1
                values_for_this_acronym[index].append(def_count)
                values_for_this_acronym[index][0] = expansion_of_last_acronym
            else:
                # a new definition starts here
                insts_per_def[min(inst_count, len(insts_per_def) - 1)] += 1
                inst_count = 0
                def_count += 1
                expansion_of_last_acronym = acronym_expansion
                values_for_this_acronym[index].append(def_count)
        defs_per_acronym[min(def_count, len(defs_per_acronym) - 1)] += 1
        # replacing the value of an existing key does not resize the dict
        acronymDB[acronym] = numpy.array(values_for_this_acronym)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")