def main(args):
    """Load sample metadata and per-sample gene counts into the database.

    Input files are named on ``args``:

    * ``args.sample_info`` -- comma-separated table indexed by sample code.
      The columns ``sample_set``, ``filter_lower``, ``filter_upper`` and one
      column per metadata category are read from it.
    * ``args.metadata_reference`` -- table indexed by metadata category; its
      ``Unit`` column supplies the unit stored with each sample property.
    * ``args.gene_counts`` -- table indexed by gene name with one column per
      sample.
    * ``args.tmp_file`` -- scratch path; for every sample a CSV is written
      there and loaded into ``gene_count`` with a ``COPY`` statement.

    Sample sets, samples, time/places and sample properties are committed
    first; gene counts are then loaded sample by sample and the materialized
    views are refreshed at the end.

    Raises:
        AssertionError: if a sample code from the sample info file is
            already present in the database.
    """
    # connect to database
    session = app.db.session()

    logging.info("Reading sample information")
    # Add all samples from the sample info file
    sample_info = pd.read_table(args.sample_info, sep=',', index_col=0)

    logging.info("Creating sample sets")
    sample_sets = {}
    for sample_set_name, sample_set_df in sample_info.groupby('sample_set'):
        # Reuse a sample set that is already stored under this name. The
        # previous code left `sample_set` unbound (first group) or pointing
        # at the previous group's set whenever the name already existed.
        sample_set = SampleSet.query.filter_by(name=sample_set_name).first()
        if sample_set is None:
            sample_set = SampleSet(sample_set_name)
        for sample_id in sample_set_df.index:
            sample_sets[sample_id] = sample_set

    logging.info("Creating individual samples")
    sample_properties = []
    all_samples = {}
    time_places = []
    metadata_reference = pd.read_table(args.metadata_reference, index_col=0)
    meta_categories = list(metadata_reference.index)
    default_units = metadata_reference['Unit'].to_dict()
    default_units['filter_lower'] = 'µm'
    default_units['filter_upper'] = 'µm'

    # These categories end up on the TimePlace object rather than being
    # stored as individual sample properties.
    time_place_categories = ['Latitude', 'Longitude', 'Collection date', 'Collection time']
    extra_categories = ['filter_lower', 'filter_upper']

    for sample_id, row in sample_info.iterrows():
        samples_with_code = Sample.query.filter_by(scilifelab_code=sample_id).all()
        assert len(samples_with_code) == 0, \
            "Sample {} is already present in the database".format(sample_id)

        meta_data = {}
        for meta_category in meta_categories:
            if meta_category == 'Collection date':
                collection_date = datetime.datetime.strptime(row[meta_category], '%y/%m/%d')
            elif meta_category == 'Collection time':
                collection_time = datetime.datetime.strptime(str(row[meta_category]), '%H:%M').time()
            else:
                meta_data[meta_category] = row[meta_category]

        for meta_category in extra_categories:
            meta_data[meta_category] = row[meta_category]

        time_place = TimePlace(
            datetime.datetime.combine(collection_date, collection_time),
            meta_data['Latitude'],
            meta_data['Longitude'])
        time_places.append(time_place)

        all_samples[sample_id] = Sample(sample_id, sample_sets[sample_id], time_place)

        for meta_category in meta_categories:
            if meta_category in time_place_categories:
                continue
            # NOTE(review): values missing from the file are presumably NaN
            # rather than None after pandas parsing, in which case this
            # check lets them through -- confirm the intended behaviour.
            if meta_data[meta_category] is not None:
                sample_properties.append(SampleProperty(
                    meta_category,
                    meta_data[meta_category],
                    default_units[meta_category],
                    all_samples[sample_id]))

    session.add_all(list(sample_sets.values()) + list(all_samples.values()) + time_places + sample_properties)

    logging.info("Commiting everything except gene counts")
    session.commit()

    # Map gene name -> gene id for every gene already stored, so the count
    # table can be restricted to known genes and keyed by database id.
    # NOTE(review): the original statement was an unfinished placeholder
    # ("# fetch from database"). The table name `gene` and the columns
    # `name` / `id` are assumed by analogy with the `gene_count` table used
    # in the COPY statement below -- confirm against the Gene model.
    commited_genes = {
        gene_name: gene_id
        for gene_name, gene_id in session.execute("SELECT name, id FROM gene")
    }

    # Fetch each gene from the gene count file and create the corresponding gene count
    logging.info("Starting with gene counts")
    gene_counts = pd.read_table(args.gene_counts, index_col=0)
    total_gene_count_len = len(gene_counts)

    filtered_gene_counts = gene_counts[gene_counts.index.isin(list(commited_genes))].copy()
    # Remember the gene count now: the frame is reshaped to one row per
    # (gene, sample) pair further down, which would inflate the figure
    # reported in the final log message.
    nr_annotated_genes = len(filtered_gene_counts)
    filtered_gene_counts['gene_name'] = filtered_gene_counts.index
    filtered_gene_counts['gene_id'] = filtered_gene_counts['gene_name'].apply(lambda x: commited_genes[x])

    # Column headers are sample codes in the file; the database wants ids.
    all_sample_ids = {sample_name: sample.id for sample_name, sample in all_samples.items()}
    filtered_gene_counts.rename(columns=all_sample_ids, inplace=True)

    sample_id_cols = filtered_gene_counts.columns.tolist()
    sample_id_cols.remove('gene_id')
    sample_id_cols.remove('gene_name')

    # Reshape from wide (gene x sample) to long (gene_id, sample_id, rpkm).
    filtered_gene_counts.index = filtered_gene_counts['gene_id']
    filtered_gene_counts = pd.DataFrame(filtered_gene_counts[sample_id_cols].stack())
    filtered_gene_counts.reset_index(inplace=True)
    filtered_gene_counts.columns = ['gene_id', 'sample_id', 'rpkm']

    tot_nr_samples = len(all_samples)
    logging.info("Start adding gene counts")

    for i, (_, sample_df) in enumerate(filtered_gene_counts.groupby('sample_id')):
        # The scratch file is overwritten for every sample.
        with open(args.tmp_file, 'w') as gene_counts_file:
            sample_df.to_csv(gene_counts_file, index=False, header=False)

        logging.info("Adding gene counts from file. Sample {}/{}".format(i+1, tot_nr_samples))
        session.execute("COPY gene_count (gene_id, sample_id, rpkm) FROM '{}' WITH CSV;".format(args.tmp_file))

    logging.info("{} out of {} are annotated genes".format(nr_annotated_genes, total_gene_count_len))
    session.commit()

    logging.info("Refreshing materialized view")
    refresh_all_mat_views()
    session.commit()
    logging.info("Finished!")
예제 #2
0
    def test_annotation_rpkm_table(self):
        """Exercise ``Annotation.rpkm_table`` limits, ordering and filters.

        Fixture: 50 rounds, each creating two genes that are counted in two
        samples, plus one annotation per annotation type. ``i % 3`` decides
        whether that annotation is attached to the first gene (0), to both
        genes (1) or to the second gene only (2).
        """
        annotation_types = [
            ("Cog", {'class': Cog}),
            ("Pfam", {'class': Pfam}),
            ("TigrFam", {'class': TigrFam}),
            ("EcNumber", {'class': EcNumber}),
        ]
        nr_annotation_types = len(annotation_types)

        annotation_sources = {}
        for type_name, _ in annotation_types:
            annotation_sources[type_name] = AnnotationSource(
                type_name, "v1.0", "rpsblast", "e_value=0.000001")

        sample1 = Sample("P1993_101", None, None)
        sample2 = Sample("P1993_102", None, None)
        both_samples = [sample1, sample2]
        nr_samples = 2

        def rpkm_values(table_rows):
            # One list of rpkm values per table row, in table order.
            return [list(per_sample.values()) for per_sample in table_rows.values()]

        def assert_rows_equal(rpkm_rows, expected):
            for values in rpkm_rows:
                assert values == expected

        for i in range(50):
            gene1 = Gene("gene1{}".format(i), None)
            gene2 = Gene("gene2{}".format(i), None)

            # Only constructed, never added to the session explicitly.
            gene_counts = [
                GeneCount(gene1, sample1, 0.001),
                GeneCount(gene1, sample2, 0.01),
                GeneCount(gene2, sample1, 0.002),
                GeneCount(gene2, sample2, 0.02),
            ]

            zero_padded = "{:04d}".format(i)
            annotation_mode = i % 3
            for type_name, type_d in annotation_types:
                annotation_cls = type_d['class']
                if type_name == 'EcNumber':
                    ec_template = "0.0.2.{}" if i > 25 else "0.0.0.{}"
                    annotation = annotation_cls(ec_template.format(i))
                elif type_name == 'Cog':
                    annotation = annotation_cls(type_name.upper() + zero_padded, "H")
                else:
                    annotation = annotation_cls(type_name.upper() + zero_padded)

                annotated_genes = []
                if annotation_mode != 2:
                    annotated_genes.append(gene1)
                if annotation_mode != 0:
                    annotated_genes.append(gene2)
                self.session.add_all([
                    GeneAnnotation(annotation, gene, annotation_sources[type_name])
                    for gene in annotated_genes
                ])

            self.session.add(gene1)
            self.session.add(gene2)
        self.session.commit()
        refresh_all_mat_views()

        # Default limit, explicit limit and no limit at all.
        limit_cases = [
            ({}, 20),
            ({'limit': 100}, 100),
            ({'limit': None}, nr_annotation_types * 50),
        ]
        for limit_kwargs, expected_nr_rows in limit_cases:
            samples, rows = Annotation.rpkm_table(**limit_kwargs)
            assert len(samples) == 2
            assert len(rows) == expected_nr_rows

        # `rows` now holds the unlimited table.
        for sample_d in rows.values():
            # sample_d should be an ordered dict
            assert ["P1993_101", "P1993_102"] == [s.scilifelab_code for s in sample_d.keys()]
        rpkms = rpkm_values(rows)

        rpkms_flat = [rpkm for rpkm_row in rpkms for rpkm in rpkm_row]
        assert len(rpkms_flat) == nr_annotation_types * nr_samples * 50

        # Annotations are sorted by total rpkm over all samples and the rpkm
        # values are summed over all genes for that annotation. The three
        # groups are roughly equal in size, so the slices stay clear of the
        # group boundaries.
        assert_rows_equal(rpkms[:67], [0.003, 0.03])
        assert_rows_equal(rpkms[69:130], [0.002, 0.02])
        assert_rows_equal(rpkms[150:200], [0.001, 0.01])

        # possible to filter on function classes
        for type_name, _ in annotation_types:
            samples, rows = Annotation.rpkm_table(limit=None, function_class=type_name.lower())
            assert len(rows) == 50
            for key in rows.keys():
                assert type_name[:3].lower() == key.annotation_type[:3]

        # possible to filter on samples
        for sample in both_samples:
            samples, rows = Annotation.rpkm_table(samples=[sample.scilifelab_code], limit=None)
            assert len(rows) == 200
            assert len(samples) == 1
            assert samples[0] == sample
            for sample_d in rows.values():
                assert list(sample_d.keys()) == [sample]

            rpkms = rpkm_values(rows)
            if sample.scilifelab_code == "P1993_101":
                top_end, high, mid, low = 65, [0.003], [0.002], [0.001]
            else:
                top_end, high, mid, low = 67, [0.03], [0.02], [0.01]
            assert_rows_equal(rpkms[:top_end], high)
            assert_rows_equal(rpkms[69:130], mid)
            assert_rows_equal(rpkms[150:200], low)

        # possible to filter on sample and function class at the same time
        for type_name, _ in annotation_types:
            for sample in both_samples:
                samples, rows = Annotation.rpkm_table(
                    limit=None,
                    function_class=type_name.lower(),
                    samples=[sample.scilifelab_code])
                assert len(rows) == 50
                for key in rows.keys():
                    assert type_name.lower()[:3] == key.annotation_type[:3]

                assert len(samples) == 1
                assert samples[0] == sample
                for sample_d in rows.values():
                    assert list(sample_d.keys()) == [sample]

                rpkms = rpkm_values(rows)
                if sample.scilifelab_code == "P1993_101":
                    high, mid, low = [0.003], [0.002], [0.001]
                else:
                    high, mid, low = [0.03], [0.02], [0.01]
                assert_rows_equal(rpkms[:9], high)
                assert_rows_equal(rpkms[19:29], mid)
                assert_rows_equal(rpkms[39:], low)

        # possible to filter on individual annotations
        annotation_ids = ["COG0001", "TIGRFAM0004", "COG0003", "PFAM0002", "0.0.2.26"]

        for subset_size in range(1, 6):
            for type_identifiers in itertools.combinations(annotation_ids, subset_size):
                samples, rows = Annotation.rpkm_table(limit=None, type_identifiers=list(type_identifiers))
                assert len(samples) == 2
                assert len(rows) == len(type_identifiers)
                assert set(key.type_identifier for key in rows.keys()) == set(type_identifiers)
예제 #3
0
    def test_taxon(self):
        """Check Taxon/Gene linkage, ``Taxon.rpkm`` and ``Taxon.rpkm_table``.

        Genes, gene counts and taxa are added step by step; after each step
        the per-sample rpkm of the affected taxon is asserted. The final
        section checks the aggregated table at superkingdom and phylum
        level.
        """
        ref_assembly = ReferenceAssembly("Version 1")
        gene1 = Gene("gene1", ref_assembly)

        sample1 = Sample("P1993_101", None, None)
        gene_count1 = GeneCount(gene1, sample1, 0.001)
        taxon1 = Taxon(superkingdom="Bacteria", phylum="Proteobacteria")
        gene1.taxon = taxon1
        self.session.add(gene1)
        self.session.add(taxon1)
        self.session.add(sample1)
        self.session.add(gene_count1)
        self.session.commit()

        gene1 = Gene.query.first()
        taxon1 = Taxon.query.first()

        # Levels that were not given are expected to be empty strings.
        assert gene1.taxon == taxon1
        assert gene1 in taxon1.genes
        assert taxon1.superkingdom == 'Bacteria'
        assert taxon1.phylum == 'Proteobacteria'
        assert taxon1.taxclass == ''
        assert taxon1.full_taxonomy == 'Bacteria;Proteobacteria;;;;;;'
        refresh_all_mat_views()

        # Test sample count retrieval: a sample without counts must not
        # show up in the rpkm dict.
        sample2 = Sample("P1993_102", None, None)
        self.session.add(sample2)
        self.session.commit()
        refresh_all_mat_views()
        assert taxon1.rpkm == {sample1: 0.001}

        gene_count2 = GeneCount(gene1, sample2, 0.2)
        self.session.add(gene_count2)
        self.session.commit()
        refresh_all_mat_views()
        assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

        gene2 = Gene("gene2", ref_assembly)
        gene_count3 = GeneCount(gene2, sample2, 0.1)

        self.session.add(gene2)
        self.session.add(gene_count3)
        self.session.commit()
        refresh_all_mat_views()

        # taxon1.rpkm should still be the same since the new gene is not connected to taxon1
        assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

        taxon2 = Taxon(superkingdom="Eukaryota", phylum="Chlorophyta")
        gene2.taxon = taxon2
        self.session.add(taxon2)
        self.session.add(gene2)
        self.session.commit()
        refresh_all_mat_views()

        # Taxon2 should have gene_count3 stats only
        assert taxon2.rpkm == {sample2: 0.1}

        # This gene is linked through the taxon_id argument instead of the
        # `taxon` attribute used above.
        gene3 = Gene("gene3", ref_assembly, taxon_id=taxon1.id)
        gene_count4 = GeneCount(gene3, sample1, 1.0)

        self.session.add(gene3)
        self.session.add(gene_count4)
        self.session.commit()

        # Taxon1 should now have the original stats plus gene_count4
        assert taxon1.rpkm == {sample1: 1.001, sample2: 0.2}

        # taxon3 is committed first so that its id is available for gene4.
        taxon3 = Taxon(superkingdom="Eukaryota", phylum="Unnamed", taxclass="Dinophyceae")
        self.session.add(taxon3)
        self.session.commit()
        gene4 = Gene("gene4", ref_assembly, taxon_id=taxon3.id)
        gene_count5 = GeneCount(gene4, sample2, 0.003)

        self.session.add(gene4)
        self.session.add(gene_count5)
        self.session.commit()
        refresh_all_mat_views()

        # theoretical rpkm_table:
        # samples = [sample1, sample2]
        # rpkm_table = {"Bacteria": {"P1993_101": 1.001, "P1993_102": 0.2}, "Eukaryota": {"P1993_102": 0.103}}
        samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table()
        assert samples == [sample1, sample2]
        # Sorted by summed rpkm
        assert [complete_val_to_val[complete_level_val] for complete_level_val in rpkm_table.keys()] == ["Bacteria", "Eukaryota"]
        assert rpkm_table["Bacteria"] == {sample1: 1.001, sample2: 0.2}
        assert rpkm_table["Eukaryota"] == {sample2: 0.103}

        # At phylum level the table keys are the ';'-joined taxonomy down to
        # that level, while complete_val_to_val maps each key to the phylum
        # name alone.
        samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table(level='phylum')
        assert samples == [sample1, sample2]
        # Sorted by summed rpkm
        assert [complete_val_to_val[complete_level_val] for complete_level_val in rpkm_table.keys()] == ["Proteobacteria", "Chlorophyta", "Unnamed"]

        assert rpkm_table["Bacteria;Proteobacteria"] == {sample1: 1.001, sample2: 0.2}
        assert rpkm_table["Eukaryota;Chlorophyta"] == {sample2: 0.1}
        assert rpkm_table["Eukaryota;Unnamed"] == {sample2: 0.003}
예제 #4
0
    def test_taxon_large_scale_rpkm_table(self):
        """Exercise ``Taxon.rpkm_table`` on 2 x 3 x 20 taxa and two samples.

        Every taxon gets two genes; ``i % 3`` decides whether counts are
        created for the first gene (0), for both genes (1) or for the second
        gene only (2). The table is then checked for row limits, ordering
        and the taxonomy/sample filters.
        """
        sample1 = Sample("P1993_101", None, None)
        sample2 = Sample("P1993_102", None, None)
        both_samples = [sample1, sample2]
        nr_samples = 2

        superkingdoms = ["sk_{}".format(n) for n in range(2)]
        phyla = ["ph_{}".format(n) for n in range(3)]
        taxclasses = ["tc_{}".format(n) for n in range(20)]

        taxons = [
            Taxon(superkingdom=sk, phylum=ph, taxclass=tc)
            for sk in superkingdoms
            for ph in phyla
            for tc in taxclasses
        ]

        # Committed before the genes are built because the genes refer to
        # the taxa through taxon.id.
        self.session.add_all(taxons)
        self.session.commit()
        refresh_all_mat_views()

        for i, taxon in enumerate(taxons):
            count_mode = i % 3

            gene1 = Gene("gene1{}".format(i), None, taxon_id=taxon.id)
            gene2 = Gene("gene2{}".format(i), None, taxon_id=taxon.id)

            gene_counts = []
            if count_mode != 2:
                gene_counts.append(GeneCount(gene1, sample1, 0.001))
                gene_counts.append(GeneCount(gene1, sample2, 0.01))
            if count_mode != 0:
                gene_counts.append(GeneCount(gene2, sample1, 0.002))
                gene_counts.append(GeneCount(gene2, sample2, 0.02))

            self.session.add_all(gene_counts)
            self.session.add(gene1)
            self.session.add(gene2)

        self.session.commit()
        refresh_all_mat_views()

        def rpkm_values(table_rows):
            # One list of rpkm values per table row, in table order.
            return [list(per_sample.values()) for per_sample in table_rows.values()]

        def level_values(table_rows, value_lookup):
            # Translate the complete (';'-joined) row keys to level values.
            return [value_lookup[complete_val] for complete_val in table_rows.keys()]

        def assert_rows_equal(rpkm_rows, expected):
            for values in rpkm_rows:
                assert values == expected

        # Row counts per level, with and without the default limit.
        row_count_cases = [
            ({}, 2),                                      # unique superkingdoms
            ({'level': "phylum"}, 6),                     # unique down to phylum
            ({'level': "taxclass"}, 20),                  # default limit
            ({'level': "taxclass", 'limit': None}, 120),  # unique down to taxclass
        ]
        for table_kwargs, expected_nr_rows in row_count_cases:
            samples, rows, complete_val_to_val = Taxon.rpkm_table(**table_kwargs)
            assert len(samples) == 2
            assert len(rows) == expected_nr_rows

        samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass", limit=None)

        for sample_d in rows.values():
            # sample_d should be an ordered dict
            assert ["P1993_101", "P1993_102"] == [s.scilifelab_code for s in sample_d.keys()]
        rpkms = rpkm_values(rows)

        rpkms_flat = [rpkm for rpkm_row in rpkms for rpkm in rpkm_row]
        assert len(rpkms_flat) == 2 * 3 * 20 * nr_samples

        # Taxa are sorted by total rpkm over all samples and the rpkm values
        # are summed over all genes of the taxon; the three count modes are
        # equally frequent.
        assert_rows_equal(rpkms[:40], [0.003, 0.03])
        assert_rows_equal(rpkms[40:80], [0.002, 0.02])
        assert_rows_equal(rpkms[80:120], [0.001, 0.01])

        # possible to filter on specific level values at superkingdom
        for level_val in superkingdoms:
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=[level_val],
                top_level="superkingdom", level="phylum")
            assert len(rows) == 3
            assert level_values(rows, complete_val_to_val) == ["ph_2", "ph_0", "ph_1"]
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=[level_val],
                top_level="superkingdom", level="taxclass")
            assert len(rows) == 3*20

        # possible to filter on specific level values at phylum
        for sk_level_val in superkingdoms:
            for ph_level_val in phyla:
                top_level_complete_value = "{};{}".format(sk_level_val, ph_level_val)
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    limit=None, top_level_complete_values=[top_level_complete_value],
                    top_level="phylum", level="phylum")
                assert len(rows) == 1
                assert level_values(rows, complete_val_to_val) == [ph_level_val]
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    limit=None, top_level_complete_values=[top_level_complete_value],
                    top_level="phylum", level="taxclass")
                assert len(rows) == 20

        # possible to filter on multiple specific level values at phylum
        for sk_level_val in superkingdoms:
            for ph_level_vals in itertools.combinations(phyla, 2):
                top_level_complete_values = [
                    "{};{}".format(sk_level_val, ph_level_val)
                    for ph_level_val in ph_level_vals
                ]
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    limit=None, top_level_complete_values=top_level_complete_values,
                    top_level="phylum", level="phylum")
                assert len(rows) == 2
                assert sorted(level_values(rows, complete_val_to_val)) == sorted(list(ph_level_vals))
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    limit=None, top_level_complete_values=top_level_complete_values,
                    top_level="phylum", level="taxclass")
                assert len(rows) == 40

        # possible to filter on specific level values at taxclass
        for sk_level_val in superkingdoms:
            for ph_level_val in phyla:
                for tc_level_val in taxclasses:
                    top_level_complete_value = "{};{};{}".format(sk_level_val, ph_level_val, tc_level_val)
                    samples, rows, complete_val_to_val = Taxon.rpkm_table(
                        limit=None, top_level_complete_values=[top_level_complete_value],
                        top_level="taxclass", level="taxclass")
                    assert len(rows) == 1

        # possible to filter on multiple specific level values at taxclass
        for sk_level_val in superkingdoms:
            for ph_level_val in phyla:
                for tc_level_vals in itertools.combinations(taxclasses[:5], 4):
                    top_level_complete_values = [
                        "{};{};{}".format(sk_level_val, ph_level_val, tc_level_val)
                        for tc_level_val in tc_level_vals
                    ]
                    samples, rows, complete_val_to_val = Taxon.rpkm_table(
                        limit=None, top_level_complete_values=top_level_complete_values,
                        top_level="taxclass", level="taxclass")
                    assert len(rows) == 4

        # possible to filter on samples
        for sample in both_samples:
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                samples=[sample.scilifelab_code], level="taxclass", limit=None)
            assert len(rows) == 120
            assert len(samples) == 1
            assert samples[0] == sample
            for sample_d in rows.values():
                assert list(sample_d.keys()) == [sample]

            rpkms = rpkm_values(rows)
            if sample.scilifelab_code == "P1993_101":
                high, mid, low = [0.003], [0.002], [0.001]
            else:
                high, mid, low = [0.03], [0.02], [0.01]
            assert_rows_equal(rpkms[:40], high)
            assert_rows_equal(rpkms[40:80], mid)
            assert_rows_equal(rpkms[80:120], low)

        # possible to filter on sample and taxon at the same time
        for sample in both_samples:
            for sk_level_val in superkingdoms:
                top_level_complete_value = sk_level_val
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    samples=[sample.scilifelab_code], limit=None,
                    top_level_complete_values=[top_level_complete_value],
                    top_level="superkingdom", level="phylum")
                assert len(samples) == 1
                assert samples[0] == sample
                for sample_d in rows.values():
                    assert list(sample_d.keys()) == [sample]

                assert len(rows) == 3
                assert level_values(rows, complete_val_to_val) == ["ph_2", "ph_0", "ph_1"]
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    samples=[sample.scilifelab_code], limit=None,
                    top_level_complete_values=[top_level_complete_value],
                    top_level="superkingdom", level="taxclass")
                assert len(rows) == 3*20

                # 60 rows: 20 per count mode within one superkingdom.
                rpkms = rpkm_values(rows)
                if sample.scilifelab_code == "P1993_101":
                    high, mid, low = [0.003], [0.002], [0.001]
                else:
                    high, mid, low = [0.03], [0.02], [0.01]
                assert_rows_equal(rpkms[:20], high)
                assert_rows_equal(rpkms[20:40], mid)
                assert_rows_equal(rpkms[40:60], low)
예제 #5
0
def main(args):
    """Load samples, annotations, taxa, genes and gene counts into the database.

    The steps are, in order:

    1. Create sample sets, samples, time/places and sample properties from
       ``args.sample_info`` and ``args.metadata_reference``.
    2. Fetch or create the reference assembly named ``args.reference_assembly``.
    3. Bulk insert the annotations listed in ``args.all_annotations`` and the
       annotation sources listed in ``args.annotation_source_info``.
    4. Insert genes with taxonomy from ``args.taxonomy_per_gene``.
    5. Insert genes and gene annotations from ``args.gene_annotations_cog``,
       ``args.gene_annotations_pfam`` and ``args.gene_annotations_tigrfam``.
    6. Insert gene counts from ``args.gene_counts`` and refresh the
       materialized views.

    Bulk rows are written to ``args.tmp_file`` and loaded with SQL ``COPY``.

    Args:
        args: Object exposing the attributes named above (presumably an
            ``argparse.Namespace`` -- confirm against the caller).

    Raises:
        AssertionError: If a sample with the same scilifelab code is already
            in the database, or if more than one reference assembly carries
            the requested name.
    """
    # Create materialized view that is not created by manage.py
    app.db.create_all()

    # connect to database
    session = app.db.session()

    def copy_from_tmp_file(table, columns, data_frame):
        """Write the given columns to args.tmp_file as CSV and COPY them into table."""
        with open(args.tmp_file, 'w') as tmp_file:
            data_frame[columns].to_csv(tmp_file, index=False, header=False)
        # The path is embedded as a SQL string literal, so double any single quote.
        # NOTE(review): COPY ... FROM 'file' is presumably read by the database
        # server, so the path must be readable from there -- confirm deployment.
        quoted_path = str(args.tmp_file).replace("'", "''")
        session.execute("COPY {} ({}) FROM '{}' WITH CSV;".format(table, ", ".join(columns), quoted_path))

    logging.info("Reading sample information")
    # Add all samples from the sample info file
    sample_info = pd.read_table(args.sample_info, sep=',', index_col=0)

    logging.info("Creating sample sets")
    # Map every sample id to its sample set, reusing a set that already exists
    # in the database instead of leaving the variable unbound or stale.
    sample_sets = {}
    for sample_set_name, sample_set_df in sample_info.groupby('sample_set'):
        existing_sample_sets = SampleSet.query.filter_by(name=sample_set_name).all()
        if existing_sample_sets:
            sample_set = existing_sample_sets[0]
        else:
            sample_set = SampleSet(sample_set_name)
        for sample_id in sample_set_df.index:
            sample_sets[sample_id] = sample_set

    logging.info("Creating individual samples")
    sample_properties = []
    all_samples = {}
    time_places = []
    metadata_reference = pd.read_table(args.metadata_reference, index_col=0)
    meta_categories = list(metadata_reference.index)
    default_units = metadata_reference['Unit'].to_dict()
    default_units['filter_lower'] = 'µm'
    default_units['filter_upper'] = 'µm'
    extra_categories = ['filter_lower', 'filter_upper']
    # These categories end up in the TimePlace, not in a SampleProperty.
    time_place_categories = ('Latitude', 'Longitude', 'Collection date', 'Collection time')
    for sample_id, row in sample_info.iterrows():
        # A sample code must not be imported twice.
        samples_with_code = Sample.query.filter_by(scilifelab_code=sample_id).all()
        assert len(samples_with_code) == 0

        meta_data = {}

        for meta_category in meta_categories:
            if meta_category == 'Collection date':
                collection_date = datetime.datetime.strptime(row[meta_category], '%y/%m/%d')
            elif meta_category == 'Collection time':
                collection_time = datetime.datetime.strptime(str(row[meta_category]), '%H:%M').time()
            else:
                meta_data[meta_category] = row[meta_category]

        for meta_category in extra_categories:
            meta_data[meta_category] = row[meta_category]

        time_place = TimePlace(datetime.datetime.combine(collection_date, collection_time), meta_data['Latitude'], meta_data['Longitude'])
        time_places.append(time_place)

        all_samples[sample_id] = Sample(sample_id, sample_sets[sample_id], time_place)

        # NOTE(review): only the reference categories are stored as properties;
        # the extra filter_* categories are collected above but never stored.
        for meta_category in meta_categories:
            if meta_category in time_place_categories:
                continue
            # NOTE(review): missing values read by pandas are NaN rather than
            # None, so this check presumably lets them through -- confirm intent.
            if meta_data[meta_category] is not None:
                sample_properties.append(SampleProperty(meta_category, meta_data[meta_category], default_units[meta_category], all_samples[sample_id]))

    session.add_all(list(sample_sets.values()) + list(all_samples.values()) + time_places + sample_properties)

    logging.info("Creating the reference assembly")
    # Fetch the reference assembly if it exists, otherwise create it
    ref_assemblies = ReferenceAssembly.query.filter_by(name=args.reference_assembly).all()
    if len(ref_assemblies) == 0:
        ref_assembly = ReferenceAssembly(args.reference_assembly)
    else:
        assert len(ref_assemblies) == 1
        ref_assembly = ref_assemblies[0]

    session.add(ref_assembly)

    logging.info("Adding annotation information")
    # Make sure annotations are present
    annotation_info = pd.read_table(args.all_annotations, header=None, names=["type_identifier", "gene_name", "description"])
    # The first three characters of the identifier select the model class
    annotation_models = {'COG': Cog, 'TIG': TigrFam, 'pfa': Pfam, 'PFA': Pfam}
    annotation_polymorphic_id = {'COG': 'cog', 'TIG': 'tigrfam', 'pfa': 'pfam', 'PFA': 'pfam'}

    annotation_info['annotation_type'] = annotation_info['type_identifier'].apply(lambda x: annotation_polymorphic_id[x[0:3]])
    annotation_info['type_grouping'] = annotation_info['type_identifier'].apply(lambda x: x[0:3])
    annotation_info['id'] = annotation_info.index
    annotation_info['category'] = None
    for type_grouping, annotation_info_subset in annotation_info.groupby('type_grouping'):
        insert_columns = ['id', 'type_identifier', 'annotation_type', 'description']
        if type_grouping == 'COG':
            # Only the COG insert includes the category column
            insert_columns.insert(3, 'category')
        logging.info("Commiting all {} annotation info".format(type_grouping))
        session.bulk_insert_mappings(annotation_models[type_grouping], annotation_info_subset[insert_columns].to_dict(orient='index').values())
    all_annotations = dict((annotation.type_identifier, annotation) for annotation in session.query(Annotation).all())

    logging.info("Adding annotation source")
    # Create annotation source
    annotation_source_info = pd.read_table(args.annotation_source_info, sep=',', header=None, names=["annotation_type", "db_version", "algorithm", "algorithm_parameters"], index_col=0)
    all_annotation_sources = {}
    for annotation_type, row in annotation_source_info.iterrows():
        all_annotation_sources[annotation_type] = AnnotationSource(annotation_type, row.db_version, row.algorithm, row.algorithm_parameters)

    session.add_all(list(all_annotation_sources.values()))

    logging.info("Commiting everything except genes and gene counts")
    session.commit()

    def add_genes_with_taxonomy(taxonomy_per_gene, commited_genes):
        """Insert taxa and the genes that carry a taxonomy.

        Updates ``commited_genes`` (gene name -> gene id) with every gene
        found in the database afterwards and returns it.
        """
        gene_annotations = pd.read_table(taxonomy_per_gene, index_col=0)

        # The input column is called 'class'; the Taxon keyword used is 'taxclass'
        gene_annotations['taxclass'] = gene_annotations['class']

        taxonomy_columns = ["superkingdom", "phylum", "taxclass", "order", "family", "genus", "species"]

        # Only add genes with taxonomy given
        annotated_genes = gene_annotations[~gene_annotations[taxonomy_columns].isnull().all(axis=1)]

        def add_taxa(full_taxonomy, taxonomy_columns):
            """Build a Taxon from a ';'-joined taxonomy string."""
            # We want to make a difference between unnamed phyla and unset phyla:
            # empty levels below the deepest named one become None, empty levels
            # above it become "Unnamed".
            below_deepest_named = True
            rev_new_taxonomy = []
            for tax_val in reversed(full_taxonomy.split(';')):
                if tax_val == "":
                    tax_val = None if below_deepest_named else "Unnamed"
                else:
                    below_deepest_named = False
                rev_new_taxonomy.append(tax_val)

            # The trailing ';' yields one extra empty element which zip drops
            new_taxa_d = dict(zip(taxonomy_columns, reversed(rev_new_taxonomy)))
            return Taxon(**new_taxa_d)

        annotated_genes = annotated_genes.fillna("")
        # The number of taxa is lower than the genes with taxonomic annotation
        annotated_genes['full_taxonomy'] = annotated_genes["superkingdom"] + ';' + \
                annotated_genes['phylum'] + ';' + \
                annotated_genes['taxclass'] + ';' + \
                annotated_genes['order'] + ';' + \
                annotated_genes['family'] + ';' + \
                annotated_genes['genus'] + ';' + \
                annotated_genes['species'] + ';'

        all_taxas_to_be_created = {}
        first_full_taxa_to_real_full_taxa = {}
        for full_taxonomy in annotated_genes['full_taxonomy'].unique():
            taxa = add_taxa(full_taxonomy, taxonomy_columns)
            first_full_taxa_to_real_full_taxa[full_taxonomy] = taxa.full_taxonomy
            all_taxas_to_be_created[taxa.full_taxonomy] = taxa

        annotated_genes['real_full_taxonomy'] = annotated_genes['full_taxonomy'].apply(lambda x: first_full_taxa_to_real_full_taxa[x])

        logging.info("Commiting all taxa")
        session.add_all(all_taxas_to_be_created.values())
        session.commit()

        logging.info("Creating genes with taxon information")

        all_created_taxa = dict(session.query(Taxon.full_taxonomy, Taxon.id).all())

        annotated_genes['taxon_id'] = annotated_genes['real_full_taxonomy'].apply(lambda x: all_created_taxa[x])
        annotated_genes['name'] = annotated_genes.index
        annotated_genes["reference_assembly_id"] = ref_assembly.id

        copy_from_tmp_file('gene', ['name', 'reference_assembly_id', 'taxon_id'], annotated_genes)
        commited_genes.update(dict(session.query(Gene.name, Gene.id).all()))
        logging.info("{} genes present in database".format(len(commited_genes)))

        return commited_genes

    commited_genes = {}
    commited_genes = add_genes_with_taxonomy(args.taxonomy_per_gene, commited_genes)

    logging.info("Processed {} genes for gene taxonomy, moving on to functional annotation".format(len(commited_genes)))

    def add_genes_with_annotation(annotation_type, gene_annotation_arg, commited_genes, all_annotations, annotation_source):
        """Insert not yet committed genes and all gene annotations of one type.

        Updates ``commited_genes`` (gene name -> gene id) and returns it.
        """
        logging.info("Adding genes with {} annotations".format(annotation_type))
        gene_annotations = pd.read_table(gene_annotation_arg, header=None, names=["name", "type_identifier", "e_value"])

        # Only add genes once
        new_genes = gene_annotations[~gene_annotations['name'].isin(commited_genes.keys())]

        new_genes_uniq = pd.DataFrame({'name': new_genes['name'].unique()})
        new_genes_uniq["reference_assembly_id"] = ref_assembly.id

        logging.info("Commiting all {} genes.".format(annotation_type))
        copy_from_tmp_file('gene', ['name', 'reference_assembly_id'], new_genes_uniq)

        commited_genes.update(dict(session.query(Gene.name, Gene.id).all()))
        logging.info("{} genes present in database".format(len(commited_genes)))

        gene_annotations['gene_id'] = gene_annotations['name'].apply(lambda x: commited_genes[x])
        gene_annotations['annotation_id'] = gene_annotations['type_identifier'].apply(lambda x: all_annotations[x].id)
        gene_annotations['annotation_source_id'] = annotation_source.id

        logging.info("Commiting all {} gene anntations".format(annotation_type))
        copy_from_tmp_file('gene_annotation', ['gene_id', 'annotation_id', 'annotation_source_id', 'e_value'], gene_annotations)
        session.commit()
        return commited_genes

    commited_genes = add_genes_with_annotation("Cog", args.gene_annotations_cog, commited_genes, all_annotations, all_annotation_sources["Cog"])
    commited_genes = add_genes_with_annotation("Pfam", args.gene_annotations_pfam, commited_genes, all_annotations, all_annotation_sources["Pfam"])
    commited_genes = add_genes_with_annotation("TigrFam", args.gene_annotations_tigrfam, commited_genes, all_annotations, all_annotation_sources["TigrFam"])

    logging.info("Processed {} genes in total, moving on to gene counts".format(len(commited_genes)))

    # Fetch each gene from the gene count file and create the corresponding gene count
    logging.info("Starting with gene counts")
    gene_counts = pd.read_table(args.gene_counts, index_col=0)
    total_gene_count_len = len(gene_counts)

    # Keep only the genes that were committed above
    filtered_gene_counts = gene_counts[gene_counts.index.isin(commited_genes.keys())].copy()
    # Count the genes here, before the table is reshaped to one row per
    # (gene, sample) pair, so the final log line compares genes with genes.
    nr_annotated_genes = len(filtered_gene_counts)
    filtered_gene_counts['gene_name'] = filtered_gene_counts.index
    filtered_gene_counts['gene_id'] = filtered_gene_counts['gene_name'].apply(lambda x: commited_genes[x])

    # Rename the sample columns from sample name to database sample id
    all_sample_ids = dict((sample_name, sample.id) for sample_name, sample in all_samples.items())
    filtered_gene_counts.rename(columns=all_sample_ids, inplace=True)

    sample_id_cols = filtered_gene_counts.columns.tolist()
    sample_id_cols.remove('gene_id')
    sample_id_cols.remove('gene_name')

    # Reshape to long format: one row per (gene_id, sample_id, rpkm)
    filtered_gene_counts.index = filtered_gene_counts['gene_id']
    filtered_gene_counts = pd.DataFrame(filtered_gene_counts[sample_id_cols].stack())
    filtered_gene_counts.reset_index(inplace=True)
    filtered_gene_counts.columns = ['gene_id', 'sample_id', 'rpkm']

    tot_nr_samples = len(all_samples)
    logging.info("Start adding gene counts")

    # One COPY per sample
    for i, (_, sample_df) in enumerate(filtered_gene_counts.groupby('sample_id')):
        logging.info("Adding gene counts from file. Sample {}/{}".format(i+1, tot_nr_samples))
        copy_from_tmp_file('gene_count', ['gene_id', 'sample_id', 'rpkm'], sample_df)

    logging.info("{} out of {} are annotated genes".format(nr_annotated_genes, total_gene_count_len))
    session.commit()

    logging.info("Refreshing materialized view")
    refresh_all_mat_views()
    session.commit()
    logging.info("Finished!")