예제 #1
0
    def import_articles(filename):
        '''
        Import a gzipped, tab-separated article file directly into the database.

        The first line of the file is the header row: every header is passed
        through ``convert`` and a leading ``#article_id`` column is renamed to
        ``id``. Each following line is turned into one ``Article`` and the
        articles are written with ``Article.objects.bulk_create`` in batches
        of 500.

        Args:
            filename: path of the gzip-compressed TSV file to read.

        Raises:
            Exception: if anything is left in a row's dict after the known
                article columns have been extracted from it.
            ValueError: if a non-empty integer column cannot be parsed by
                ``int``.
        '''
        batch_size = 500
        pending = []
        article_cols = [
            'id',
            'external_id',
            'source',
            'orig_file',
            'journal',
            'print_issn',
            'e_issn',
            'journal_unique_id',
            'year',
            'article_type',
            'article_section',
            'authors',
            'author_emails',
            'author_affiliations',
            'keywords',
            'title',
            'abstract',
            'vol',
            'issue',
            'page',
            'pmid',
            'pmc_id',
            'doi',
            'fulltext_url',
            'time',
        ]
        # columns that arrive as strings but are stored as integers
        int_cols = ['year', 'page', 'pmid', 'pmc_id']

        with gzip.open(filename) as f:
            reader = csv.reader(f, delimiter="\t")

            for row in reader:
                if reader.line_num == 1:
                    # build a real list so the first header can be replaced
                    headers = [convert(name) for name in row]
                    if headers and headers[0] == '#article_id':
                        headers[0] = 'id'
                    continue

                line_dict = dict(zip(headers, row))
                # presumably moves the listed keys out of line_dict into the
                # returned dict -- verify against sub_dict_remove_strict
                article_dict = sub_dict_remove_strict(
                    line_dict, article_cols)
                if line_dict:
                    raise Exception('Unidentified column in file')

                # convert int columns from string to int (empty -> None)
                for field in int_cols:
                    if article_dict[field] == '':
                        article_dict[field] = None
                    else:
                        article_dict[field] = int(article_dict[field])

                # normalise the timestamp to "YYYY-MM-DD HH:MM:SS"; on a parse
                # failure the problem is reported and the raw value is kept
                if article_dict['time'] == '':
                    article_dict['time'] = None
                else:
                    try:
                        # %H:%M:%S is the portable spelling of %T
                        article_dict['time'] = time.strftime(
                            "%Y-%m-%d %H:%M:%S",
                            time.strptime(article_dict['time']))
                    except ValueError as e:
                        print(e)
                        print(article_dict['time'])

                pending.append(Article(**article_dict))

                # Flush only AFTER the current row has been queued; flushing
                # instead of processing the row would silently drop it.
                if len(pending) >= batch_size:
                    Article.objects.bulk_create(pending)
                    print("bulk updating %s articles" % len(pending))
                    pending = []

            # write whatever is left over from the last partial batch
            if pending:
                Article.objects.bulk_create(pending)
예제 #2
0
    def import_tsv_old(sample, filename):
        '''
        Import a tab-separated clonotype file for ``sample``, one row at a time.

        The first line is the header row (each name is passed through
        ``convert``). Every other line is split into amino-acid, recombination
        and clonotype column groups. An ``AminoAcid`` (only when the
        ``amino_acid`` column is non-empty) and a ``Recombination`` are fetched
        or created, and a new ``Clonotype`` referencing that recombination and
        ``sample`` is created.

        Args:
            sample: object stored as the ``sample`` of every created Clonotype.
            filename: path of the TSV file to read.

        Raises:
            Exception: if anything is left in a row's dict after the three
                column groups have been extracted, or if the row's
                ``normalized_frequency`` is empty.
        '''
        headers = None

        recombination_cols = ['nucleotide',
                              'v_family_name',
                              'v_gene_name',
                              'v_ties',
                              'd_gene_name',
                              'j_gene_name',
                              'j_ties',
                              'sequence_status',
                              'v_deletion',
                              'd5_deletion',
                              'd3_deletion',
                              'j_deletion',
                              'n2_insertion',
                              'n1_insertion',
                              'v_index',
                              'n1_index',
                              'n2_index',
                              'd_index',
                              'j_index',
                              'cdr3_length',
                              ]
        amino_acid_cols = ['amino_acid']
        clonotype_cols = [
            'sequence_id',
            'container',
            'normalized_frequency',
            'normalized_copy',
            'raw_frequency',
            'copy',
        ]

        # the context manager guarantees the file is closed, even on error
        with open(filename, 'r') as tsv_file:
            reader = csv.reader(tsv_file, delimiter="\t")

            for row in reader:
                if reader.line_num == 1:
                    headers = [convert(name) for name in row]
                    continue

                line_dict = dict(zip(headers, row))

                # split into clonotype, recombination and amino acid dicts;
                # presumably each call moves its keys out of line_dict --
                # verify against sub_dict_remove_strict
                amino_acid_dict = sub_dict_remove_strict(
                    line_dict, amino_acid_cols)
                recombination_dict = sub_dict_remove_strict(
                    line_dict, recombination_cols)
                clonotype_dict = sub_dict_remove_strict(
                    line_dict, clonotype_cols)

                # throw error if any leftover columns
                if line_dict:
                    raise Exception('Unidentified column in file')
                if clonotype_dict['normalized_frequency'] == '':
                    raise Exception('Normalized_frequency cannot be null')

                if amino_acid_dict['amino_acid'] != '':
                    # Fix discrepancies between column and model fields:
                    # the file column is "amino_acid", the lookup uses
                    # "sequence"
                    amino_acid_dict['sequence'] = amino_acid_dict.pop(
                        'amino_acid')
                    aa, created = AminoAcid.objects.get_or_create(
                        **amino_acid_dict)
                    r, created = Recombination.objects.get_or_create(
                        amino_acid=aa, **recombination_dict)
                else:
                    r, created = Recombination.objects.get_or_create(
                        **recombination_dict)

                clonotype_dict['recombination'] = r
                clonotype_dict['sample'] = sample
                Clonotype.objects.create(**clonotype_dict)