def import_articles(filename):
    """Import a gzipped, tab-separated article file directly into the database.

    The first line of the file is the header row. Every header is passed
    through ``convert`` and a leading ``#article_id`` column is renamed to
    ``id``. Each following line becomes one ``Article``; articles are
    inserted with ``bulk_create`` in batches of ``num_to_insert`` rows.

    Args:
        filename: path of the gzip-compressed TSV file to read.

    Raises:
        Exception: if a row contains a column that is not in ``article_cols``.
        ValueError: if a non-empty integer column cannot be parsed by ``int``.
    """
    num_to_insert = 500
    article_cols = [
        'id', 'external_id', 'source', 'orig_file', 'journal',
        'print_issn', 'e_issn', 'journal_unique_id', 'year',
        'article_type', 'article_section', 'authors', 'author_emails',
        'author_affiliations', 'keywords', 'title', 'abstract', 'vol',
        'issue', 'page', 'pmid', 'pmc_id', 'doi', 'fulltext_url', 'time',
    ]
    # Columns that arrive as strings but are stored as integers.
    int_cols = ['year', 'page', 'pmid', 'pmc_id']

    article_list = []
    headers = []

    with gzip.open(filename) as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            if reader.line_num == 1:
                # Build a real list (not a lazy map) so it can be indexed
                # here and re-used by zip() for every data row.
                headers = [convert(name) for name in row]
                if headers and headers[0] == '#article_id':
                    headers[0] = 'id'
                continue

            line_dict = dict(zip(headers, row))
            # NOTE(review): sub_dict_remove_strict presumably moves the listed
            # columns out of line_dict into the returned dict, so whatever is
            # left in line_dict is an unknown column - confirm against helper.
            article_dict = sub_dict_remove_strict(line_dict, article_cols)
            if line_dict:
                raise Exception('Unidentified column in file')

            # convert int columns from string to int (empty string -> NULL)
            for field in int_cols:
                if article_dict[field] == '':
                    article_dict[field] = None
                else:
                    article_dict[field] = int(article_dict[field])

            # convert timestamp to a "YYYY-MM-DD HH:MM:SS" string
            try:
                if article_dict['time'] == '':
                    article_dict['time'] = None
                else:
                    # strptime() without a format expects the ctime() layout.
                    # %H:%M:%S is the portable spelling of the C-library %T.
                    article_dict['time'] = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.strptime(article_dict['time']))
            except Exception as e:
                # Best effort: report the unparsable value and keep it as-is.
                print(e)
                print(article_dict['time'])

            # append article to list to be bulk created
            article_list.append(Article(**article_dict))

            # Flush only AFTER the current row has been queued. Flushing in a
            # branch exclusive with row parsing silently dropped the row that
            # was being read whenever the batch was full.
            if len(article_list) >= num_to_insert:
                Article.objects.bulk_create(article_list)
                print("bulk updating %s articles" % (len(article_list)))
                article_list = []

    # Insert whatever is left over from the last, partial batch.
    if article_list:
        Article.objects.bulk_create(article_list)
def import_tsv_old(sample, filename):
    """Import a tab-separated clonotype file for ``sample``, row by row.

    The first line of the file is the header row; every header is passed
    through ``convert``. Each following line is split into amino-acid,
    recombination and clonotype columns. The ``AminoAcid`` (when the
    ``amino_acid`` column is non-empty) and ``Recombination`` rows are
    fetched or created, then one ``Clonotype`` linked to ``sample`` is
    created per data row.

    Args:
        sample: object stored in the ``sample`` field of each ``Clonotype``.
        filename: path of the TSV file to read.

    Raises:
        Exception: if a row contains a column that belongs to none of the
            three column groups, or if ``normalized_frequency`` is empty.
    """
    recombination_cols = [
        'nucleotide', 'v_family_name', 'v_gene_name', 'v_ties',
        'd_gene_name', 'j_gene_name', 'j_ties', 'sequence_status',
        'v_deletion', 'd5_deletion', 'd3_deletion', 'j_deletion',
        'n2_insertion', 'n1_insertion', 'v_index', 'n1_index', 'n2_index',
        'd_index', 'j_index', 'cdr3_length',
    ]
    amino_acid_cols = ['amino_acid']
    clonotype_cols = [
        'sequence_id', 'container', 'normalized_frequency',
        'normalized_copy', 'raw_frequency', 'copy',
    ]

    headers = None
    # Use a context manager so the file handle is closed even when a row
    # raises; previously the handle passed to csv.reader was never closed.
    with open(filename, 'r') as tsv_file:
        reader = csv.reader(tsv_file, delimiter="\t")
        for row in reader:
            if reader.line_num == 1:
                # A real list, because zip() re-reads it for every data row.
                headers = [convert(name) for name in row]
                continue

            line_dict = dict(zip(headers, row))

            # split into clonotype, recombination and amino acid dicts
            # NOTE(review): sub_dict_remove_strict presumably moves the listed
            # columns out of line_dict into the returned dict - confirm.
            amino_acid_dict = sub_dict_remove_strict(
                line_dict, amino_acid_cols)
            recombination_dict = sub_dict_remove_strict(
                line_dict, recombination_cols)
            clonotype_dict = sub_dict_remove_strict(
                line_dict, clonotype_cols)

            # throw error if any leftover columns
            if line_dict:
                raise Exception('Unidentified column in file')

            if clonotype_dict['normalized_frequency'] == '':
                raise Exception('Normalized_frequency cannot be null')

            if amino_acid_dict['amino_acid'] != '':
                # Fix discrepancy between the column name and the model
                # field: the file says "amino_acid", the model "sequence".
                amino_acid_dict['sequence'] = amino_acid_dict.pop('amino_acid')
                aa, created = AminoAcid.objects.get_or_create(
                    **amino_acid_dict)
                r, created = Recombination.objects.get_or_create(
                    amino_acid=aa, **recombination_dict)
            else:
                # No amino acid on this row: recombination stands alone.
                r, created = Recombination.objects.get_or_create(
                    **recombination_dict)

            clonotype_dict['recombination'] = r
            clonotype_dict['sample'] = sample
            Clonotype.objects.create(**clonotype_dict)