def createFromScrapedDefinitions():
    """Build the acronym database from the scraped definition CSVs and dump it.

    Every file in ``file_scraped_definitions_list`` is read as a
    comma-delimited CSV with a header row; the columns ``acronym``,
    ``acronym_expansion`` and ``article_id`` are used. Each expansion is
    stripped, lower-cased and has its hyphens replaced by spaces, then the
    ``[expansion, article_id]`` pair is grouped under its acronym.

    For each acronym the pairs are sorted by expansion and walked in order.
    A running definition index (starting at 0) is appended to every pair as
    a third element. While ``AcronymExpansion.startsSameWay`` reports that
    an expansion matches the current representative expansion, the pair
    keeps the current index and its expansion text is overwritten with the
    representative; otherwise the index is incremented and that expansion
    becomes the new representative.

    The per-acronym lists are converted with ``numpy.array`` and the
    resulting dict is handed to ``dump``. Nothing is returned.
    """
    common_logger.info("Creating AcronymDB")
    # Raise the csv module's maximum accepted field size.
    # NOTE(review): sys.maxint exists only on Python 2; together with the
    # binary-mode open below this function appears to target Python 2.
    csv.field_size_limit(sys.maxint)

    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # The context manager guarantees the file is closed once all of its
        # rows are consumed (or if reading fails part-way through).
        with open(definition_file, "rb") as csv_file:
            # DictReader takes the column names from the header row.
            for row in csv.DictReader(csv_file, delimiter=","):
                acronym = toUnicode(row["acronym"])
                acronym_expansion = toUnicode(row["acronym_expansion"])
                article_id = toUnicode(row["article_id"])

                normalised_expansion = (
                    acronym_expansion.strip().lower().replace('-', ' '))
                # An article title was stored here in an older format; only
                # the expansion and the article id are kept now.
                acronymDB.setdefault(acronym, []).append(
                    [normalised_expansion, article_id])

                loaded_acronyms += 1
                if loaded_acronyms % 10000 == 0:
                    common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    # Snapshot the keys: values are reassigned while walking the dict.
    for acronym in list(acronymDB):
        # Sorting by expansion puts similar expansions next to each other.
        # sorted() returns a new outer list but reuses the inner pair lists,
        # which are extended in place below.
        entries = sorted(acronymDB[acronym], key=lambda pair: pair[0])

        def_count = 0
        # Every acronym has at least one entry, so index 0 always exists.
        representative_expansion = entries[0][0]
        for entry in entries:
            if AcronymExpansion.startsSameWay(entry[0],
                                              representative_expansion):
                # Same definition as the current group: share its index and
                # rewrite the text to the group's representative expansion.
                entry.append(def_count)
                entry[0] = representative_expansion
            else:
                # A new definition starts here.
                def_count += 1
                representative_expansion = entry[0]
                entry.append(def_count)

        acronymDB[acronym] = numpy.array(entries)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")