def remove_duplicates(by='pmid'):
    """Remove duplicate articles, keeping one record per distinct field value.

    Reads the ``by`` field of every stored article, tallies how often each
    value occurs, and for every value seen more than once removes all
    matching articles except the first one returned by ``Article.find``.

    :param str by: Article field to identify duplicates
    """
    # Tally field values straight from the backing store. The second argument
    # ({by: 1}) looks like a field projection -- presumably a pymongo-style
    # collection; TODO confirm.
    # NOTE(review): a stored record without the ``by`` key would raise
    # KeyError on ``record[by]`` -- confirm every article carries the field.
    counts = collections.Counter(
        record[by]
        for record in Article._storage[0].store.find({}, {by: 1})
    )
    for value, count in counts.items():
        if count == 1:
            continue
        articles = list(Article.find(Q(by, 'eq', value)))
        # Keep the first match; every later match is removed as a duplicate.
        for duplicate in articles[1:]:
            logger.debug('Deleting duplicate record: {}'.format(value))
            Article.remove_one(duplicate)
def remove_duplicates(by='pmid'):
    """Remove duplicate articles, keeping one record per distinct field value.

    Reads the ``by`` field of every stored article, tallies how often each
    value occurs, and for every value seen more than once removes all
    matching articles except the first one returned by ``Article.find``.

    :param str by: Article field to identify duplicates
    """
    # Tally field values straight from the backing store. The second argument
    # ({by: 1}) looks like a field projection -- presumably a pymongo-style
    # collection; TODO confirm.
    # NOTE(review): a stored record without the ``by`` key would raise
    # KeyError on ``record[by]`` -- confirm every article carries the field.
    counts = collections.Counter(
        record[by]
        for record in Article._storage[0].store.find({}, {by: 1})
    )
    for value, count in counts.items():
        if count == 1:
            continue
        articles = list(Article.find(Q(by, 'eq', value)))
        # Keep the first match; every later match is removed as a duplicate.
        for duplicate in articles[1:]:
            logger.debug('Deleting duplicate record: {}'.format(value))
            Article.remove_one(duplicate)