        Verify that there were not ambiguous indices created for different eLife uploads.
        '''
        indices = []
        unique = 0
        nonunique = 0
        uniq = True
        for meas in measurements:
            # concatenate all index fields into a single lookup string and check whether it was seen before
            index_string = ''
            for field in meas['index']:
                index_string = index_string + str(field)
            if index_string in indices:
                print("Nonunique index field: ", index_string)
                nonunique += 1
                uniq = False
            else:
                indices.append(index_string)
                unique += 1
        print("Unique fields: ", unique)
        print("Nonunique fields: ", nonunique)
        return uniq

if __name__ == "__main__":
    args = parser.parse_args()
    if args.path is None:
        args.path = "data/"
    if not os.path.isdir(args.path):
        os.makedirs(args.path)
    connTDB = elife_upload(**args.__dict__)
    connTDB.upload(**args.__dict__)
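
# Example (not part of the upload script): a minimal sketch of how the uniqueness
# check above behaves, assuming each measurement's 'index' is a list of fields that
# get concatenated into one lookup string. The measurement values are hypothetical.
example_measurements = [
    {'index': ['strain-A', 'assay-1', '2017-01']},
    {'index': ['strain-B', 'assay-1', '2017-01']},
    {'index': ['strain-A', 'assay-1', '2017-01']},   # duplicate of the first entry
]
seen = []
for meas in example_measurements:
    index_string = ''.join(str(field) for field in meas['index'])
    print(index_string, 'nonunique' if index_string in seen else 'unique')
    seen.append(index_string)
# Expected output: the first two entries print 'unique', the third prints 'nonunique'.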
                # align the candidate sequence against each outgroup with mafft and
                # score the number of identical alignment columns
                SeqIO.write([oseq, record], "temp_in.fasta", "fasta")
                os.system("mafft --auto temp_in.fasta > temp_out.fasta 2>tmp")
                tmp_aln = np.array(AlignIO.read('temp_out.fasta', 'fasta'))
                scores.append((olineage, (tmp_aln[0] == tmp_aln[1]).sum()))
            # pick the outgroup with the highest identity score
            scores.sort(key=lambda x: x[1], reverse=True)
            if scores[0][1] > min_score_percentage * len(record.seq):
                print("Lineage based on similarity:", scores[0][0], doc['strain'], len(record.seq), scores)
                return self.outgroup_patterns[scores[0][0]]
            else:
                print("Couldn't parse virus subtype and lineage from aligning sequence: ", doc['strain'], len(record.seq), scores)
                return None
        except:
            print("Alignment failed: " + doc['strain'])
            return None

if __name__ == "__main__":
    args = parser.parse_args()
    sequence_fasta_fields = {0: 'accession', 1: 'strain', 2: 'isolate_id', 3: 'locus', 4: 'passage', 5: 'submitting_lab'}
    # >>B/Austria/896531/2016 | EPI_ISL_206054 | 687738 | HA | Siat 1
    setattr(args, 'fasta_fields', sequence_fasta_fields)
    xls_fields_wanted = [('strain', 'Isolate_Name'), ('isolate_id', 'Isolate_Id'), ('collection_date', 'Collection_Date'),
                         ('host', 'Host'), ('Subtype', 'Subtype'), ('Lineage', 'Lineage'), ('gisaid_location', 'Location'),
                         ('originating_lab', 'Originating_Lab'), ('Host_Age', 'Host_Age'), ('Host_Age_Unit', 'Host_Age_Unit'),
                         ('gender', 'Host_Gender'), ('submission_date', 'Submission_Date')]
    setattr(args, 'xls_fields_wanted', xls_fields_wanted)
    if args.path is None:
        args.path = "data/"
    if not os.path.isdir(args.path):
        os.makedirs(args.path)
    connVDB = flu_upload(**args.__dict__)
    connVDB.upload(**args.__dict__)
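
# Example (not part of the upload script): a minimal sketch of the identity score
# computed in the alignment loop above, using two hypothetical pre-aligned sequences
# instead of running mafft. It only illustrates (tmp_aln[0] == tmp_aln[1]).sum().
import numpy as np

aln = np.array([list("ATGG-ACT"),
                list("ATGGTACT")])
matches = (aln[0] == aln[1]).sum()        # number of identical alignment columns
print(matches, matches / aln.shape[1])    # 7 matching columns out of 8 -> 0.875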
    def separate_viruses_sequences(self, data, **kwargs):
        viruses = []
        sequences = []
        for record in data:
            # split each parsed record into a virus document and a sequence document
            v = {k: v for k, v in record.items() if k in virus_attribs}        # virus_attribs defined in __main__ below
            s = {k: v for k, v in record.items() if k in sequence_attribs}
            v = self.add_virus_fields(v, **kwargs)      # add attributes specified at the command line, and universal fields like 'number of sequences'
            s = self.add_sequence_fields(s, **kwargs)
            sequences.append(s)
            viruses.append(v)
        return (viruses, sequences)

if __name__ == "__main__":
    args = parser.parse_args()  # parser is an argparse object initiated in parse.py
    # fields in fasta headers that you want used in parse.py > parse > parse_fasta_file ---> (viruses, sequences)
    virus_attribs = ['strain', 'original_strain', 'virus', 'serotype', 'collection_date', 'region', 'country', 'division', 'location']
    sequence_attribs = ['accession', 'strain', 'original_strain', 'virus', 'serotype', 'locus', 'sequence', 'authors', 'PMID', 'source', 'gene_list']
    if args.fname is None:
        setattr(args, 'fname', 'results.tbl')
        setattr(args, 'ftype', 'tsv')
    if args.virus is None:
        setattr(args, 'virus', 'dengue')
    if args.database is None:
        setattr(args, 'database', 'vdb')
    setattr(args, 'virus_attribs', virus_attribs)
    setattr(args, 'sequence_attribs', sequence_attribs)
    connVDB = dengue_upload(**args.__dict__)
    connVDB.upload(**args.__dict__)
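
# Example (not part of the upload script): a minimal sketch of how one parsed record
# is split into a virus document and a sequence document by the attribute lists above.
# The record values and the shortened attribute lists here are hypothetical.
virus_attribs = ['strain', 'virus', 'serotype', 'country']
sequence_attribs = ['accession', 'strain', 'locus', 'sequence']
record = {'strain': 'DENV2/EXAMPLE/0001/2016', 'virus': 'dengue', 'serotype': 'denv2',
          'country': 'example_country', 'accession': 'EXAMPLE123', 'locus': 'genome',
          'sequence': 'atgaacaac'}
virus = {k: val for k, val in record.items() if k in virus_attribs}
sequence = {k: val for k, val in record.items() if k in sequence_attribs}
print(virus)      # strain/virus/serotype/country fields only
print(sequence)   # accession/strain/locus/sequence fields only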