def process_overhangs(repo, project, overhangs_file, instanceid, authorid, date):
    """Register the overhangs listed in a manifest file as 'overhang' features.

    Reads ``overhangs_file``, creates a collection named
    ``'overhang-' + instanceid``, appends it to ``repo['collections']``,
    links it to the project's collection, and creates one feature per
    comma-separated line, adding each feature to the new collection.

    Args:
        repo: Repository mapping; must provide a ``'collections'`` list.
        project: Mapping providing the project's ``'idcollection'``.
        overhangs_file: Path of the overhangs manifest to read.
        instanceid: Identifier used in the collection name and description.
        authorid: Author identifier stored on the collection and its links.
        date: Creation date stored on the collection, features and links.

    Returns:
        None. Nothing is created when the file holds at most one line.
    """
    with open(overhangs_file) as file:
        inputlines = file.readlines()
    validate_input_file(inputlines[0], overhangs_file)
    if len(inputlines) <= 1:
        return

    collectionid = uuid.uuid4()
    overhangcollection = {
        'authorid': authorid,
        'datecreated': date,
        'description': 'Overhangs described in ' + instanceid,
        'name': 'overhang-' + instanceid,
        'idcollection': collectionid,
    }
    repo['collections'].append(overhangcollection)

    # Containing collection first, member second -- the same argument order
    # as the FEATURE call below. The new collection belongs to the project,
    # not the other way round.
    repository.add_object_to_collection(repo, project['idcollection'],
                                        collectionid, 'COLLECTION',
                                        authorid, date)

    # NOTE(review): the first line is passed to validate_input_file and is
    # presumably a header, yet it is still visited here (it is only skipped
    # when it contains no comma) -- confirm whether it should be excluded.
    for line in inputlines:
        if ',' not in line:
            continue
        tokens = line.split(',')
        featurename = 'overhang-' + tokens[0].strip()
        # Strip before storing: on a two-column line the second token still
        # carries the trailing newline, which would end up in the sequence.
        featuresequence = tokens[1].strip().lower()
        feature = repository.create_feature(repo, featurename,
                                            featuresequence, 'overhang', date)
        # create_feature returns the feature record (process_vectors reads
        # its 'idfeature' and 'nucseq' keys); the collection stores the id.
        repository.add_object_to_collection(repo, collectionid,
                                            feature['idfeature'], 'FEATURE',
                                            authorid, date)
def persist_part_feature(repo, vectorname, part, familyname, authorid, date):
    """Create the feature for the portion of a part between its overhangs.

    Looks up the annotations of ``vectorname`` via ``get_nucseq_annotations``,
    removes a prefix as long as the 5' overhang and a suffix as long as the
    3' overhang from the part's (stripped, lower-cased) sequence, stores the
    remainder as a new feature named ``'Feature-' + part['name']`` and
    annotates the part's nucseq with it.

    Args:
        repo: Repository mapping handed through to the ``repository`` helpers.
        vectorname: Name of the vector whose overhang annotations are used.
        part: Mapping with ``'name'`` and ``'nucseq'`` (holding ``'sequence'``).
        familyname: Family name given to the created feature.
        authorid: Author identifier stored on the annotation.
        date: Creation date stored on the feature and the annotation.

    Returns:
        None.
    """
    # The first element of the returned triple is not needed here.
    _, fiveprimeoverhang, threeprimeoverhang = get_nucseq_annotations(
        repo, vectorname)

    partseq = part['nucseq']['sequence'].strip().lower()
    fiveprimelength = len(
        fiveprimeoverhang['feature']['nucseq']['sequence'].strip())
    threeprimelength = len(
        threeprimeoverhang['feature']['nucseq']['sequence'].strip())

    # Explicit end index (rather than a negative one) so a zero-length
    # 3' overhang keeps the whole tail instead of yielding an empty slice.
    partfeatureseq = partseq[fiveprimelength:len(partseq) - threeprimelength]

    featurename = 'Feature-' + part['name']
    partfeature = repository.create_feature(repo, featurename, partfeatureseq,
                                            familyname, date)
    # Use the same stripped 5' length that produced the slice above, so the
    # recorded position cannot drift from the slice start when the stored
    # overhang sequence carries surrounding whitespace.
    repository.add_feature_to_nucseq(repo, featurename, part['nucseq'],
                                     partfeature, fiveprimelength,
                                     authorid, date)
def process_vectors(repo, project, vectorsfiles, directories, instanceid,
                    authorid, date):
    """Load the vectors described by one or more manifest files into ``repo``.

    For every manifest in ``vectorsfiles`` a collection is created and linked
    to the project's collection. Each row after the first must provide at
    least five non-empty comma-separated values: GenBank file name, vector
    name, resistance name, 5' overhang name and 3' overhang name; an optional
    sixth value is used as the description. For each row the GenBank file is
    looked up in ``directories``, a vector and its nucseq are appended to
    ``repo``, the two named overhang features are annotated on the sequence,
    and the named resistance feature is annotated (or created from the
    sequence following the 3' overhang when no such feature exists).

    Args:
        repo: Repository mapping; must provide ``'collections'``,
            ``'nucseq'`` and ``'vectors'`` lists.
        project: Mapping providing the project's ``'idcollection'``.
        vectorsfiles: Iterable of manifest paths.
        directories: Candidate directories holding the GenBank files; entries
            that are not existing directories are ignored.
        instanceid: Identifier used in the collection description.
        authorid: Author identifier stored on created records and links.
        date: Creation date stored on created records and links.

    Raises:
        ValueError: If a row has fewer than five values, the GenBank file is
            not present in any directory, an overhang named by the row is not
            a known 'overhang' feature, or the 3' overhang cannot be located
            in the vector when a resistance feature has to be derived.
    """
    # Loop-invariant: filter the search locations once instead of per row.
    searchdirs = [d for d in directories if os.path.isdir(d)]

    for vectorsfile in vectorsfiles:
        with open(vectorsfile) as file:
            lines = file.readlines()
        validate_input_file(lines[0], vectorsfile)
        if len(lines) <= 1:
            continue

        collectionid = uuid.uuid4()
        # NOTE(review): unlike the overhang collection built in
        # process_overhangs, this collection gets no 'name' -- confirm
        # whether that is intentional.
        vectors = {
            'authorid': authorid,
            'datecreated': date,
            'description': 'Vectors described in ' + instanceid,
            'idcollection': collectionid,
        }
        repo['collections'].append(vectors)
        repository.add_object_to_collection(repo, project['idcollection'],
                                            collectionid, 'COLLECTION',
                                            authorid, date)

        # start=2: the first line was consumed by validate_input_file, so
        # the first row handled here is line 2 of the file.
        for lineno, line in enumerate(lines[1:], start=2):
            if not line.strip():
                # Tolerate blank (e.g. trailing) lines.
                continue
            tokens = [t.strip() for t in line.split(',')]
            tokens = [t for t in tokens if t]
            if len(tokens) < 5:
                # lineno is an int; it must be converted before being
                # concatenated or the raise itself fails with TypeError.
                raise ValueError(
                    'The vectors file ' + str(vectorsfile) +
                    ' does not have the required number of tokens on line ' +
                    str(lineno))

            vectorfilename = tokens[0]
            vectorname = 'Vector-' + tokens[1]
            resistancename = 'Resistance-' + tokens[2]
            fiveprimeoverhangname = 'Overhang-' + tokens[3]
            threeprimeoverhangname = 'Overhang-' + tokens[4]
            if len(tokens) > 5:
                description = tokens[5]
            else:
                description = ('From ' + vectorfilename + ': ' +
                               fiveprimeoverhangname + ', ' +
                               threeprimeoverhangname + ', ' + resistancename)

            # Locate the GenBank file for THIS row. Failing loudly here
            # prevents a missing file from silently reusing the sequence
            # read for a previous row.
            for directory in searchdirs:
                if vectorfilename in os.listdir(directory):
                    vectorsequence = read_genbank_file(
                        os.path.join(directory, vectorfilename))
                    break
            else:
                raise ValueError(
                    'The vector file ' + vectorfilename +
                    ' could not be found in any of the given directories.')

            vectorid = uuid.uuid4()
            nucseq = {
                'datecreated': date,
                'idnucseq': vectorid,
                'sequence': vectorsequence,
            }
            vector = {
                'authorid': authorid,
                'datecreated': date,
                'description': description,
                'name': vectorname,
                'idvector': vectorid,
                'nucseq': nucseq,
                'iscircular': True,
            }
            repo['nucseq'].append(nucseq)
            repo['vectors'].append(vector)

            # Annotate the first feature matching each overhang name
            # (case-insensitive). The 3' feature is kept because it anchors
            # a derived resistance feature below.
            foundfiveprime = False
            threeprimefeature = None
            overhangfeatures = repository.get_features_by_family_name(
                repo, 'overhang')
            for feature in overhangfeatures:
                featurename = feature['name'].upper()
                if (not foundfiveprime
                        and featurename == fiveprimeoverhangname.upper()):
                    position = repository.get_overhang_position_in_vector(
                        vectorsequence, feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position,
                                                     authorid, date)
                    foundfiveprime = True
                if (threeprimefeature is None
                        and featurename == threeprimeoverhangname.upper()):
                    position = repository.get_overhang_position_in_vector(
                        vectorsequence, feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position,
                                                     authorid, date)
                    threeprimefeature = feature
            if not foundfiveprime or threeprimefeature is None:
                raise ValueError(
                    'The overhangs caused by vector ' + vectorname +
                    ' were not defined in the overhangs manifest.')

            # Annotate every existing resistance feature with a matching name.
            foundresistance = False
            for feature in repository.get_features_by_family_name(
                    repo, 'resistance'):
                if feature['name'].upper() == resistancename.upper():
                    position = nucseq['sequence'].find(
                        feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position,
                                                     authorid, date)
                    foundresistance = True

            if not foundresistance:
                # No known resistance feature: derive one from the part of
                # the vector sequence that follows the 3' overhang.
                threeprimesequence = threeprimefeature['nucseq']['sequence']
                overhangpos = repository.get_overhang_position_in_vector(
                    vectorsequence, threeprimesequence)
                if overhangpos < 0:
                    raise ValueError('The overhang ' +
                                     threeprimefeature['name'] +
                                     ' could not be found in the vector ' +
                                     vectorname)
                # NOTE(review): the "+ 1" skips one base after the overhang;
                # kept as in the original -- confirm this is intended.
                startpos = overhangpos + len(threeprimesequence) + 1
                resistancesequence = nucseq['sequence'][startpos:]
                f = repository.create_feature(repo, resistancename,
                                              resistancesequence,
                                              'resistance', date)
                position = nucseq['sequence'].find(f['nucseq']['sequence'])
                repository.add_feature_to_nucseq(repo, vectorname, nucseq, f,
                                                 position, authorid, date)
                repository.add_object_to_collection(repo, collectionid,
                                                    f['idfeature'], 'FEATURE',
                                                    authorid, date)

            repository.add_object_to_collection(repo, collectionid, vectorid,
                                                'VECTOR', authorid, date)