def store_results(
    transcript_name, minimum_CG, maximum_CG, maximum_offtarget, scaffold,
    immuno, results
):
    """Persist one query and the results computed for it.

    Args:
        transcript_name: Stored as ``InputData.transcript_name``.
        minimum_CG: Stored as ``InputData.minimum_CG``.
        maximum_CG: Stored as ``InputData.maximum_CG``.
        maximum_offtarget: Stored as ``InputData.maximum_offtarget``.
        scaffold: Stored as ``InputData.scaffold``.
        immuno: Stored as ``InputData.immunostimulatory``.
        results: Iterable of dicts. Each one must provide the keys
            'score', 'frame' (an object exposing ``template()`` and
            ``id``), 'folding' (a mapping with 'path_id') and
            'found_sequence'.

    Returns:
        The list of ``Result`` objects that were added and committed.
    """
    stored_rows = []
    for entry in results:
        row = Result(
            score=entry['score'],
            shmir=entry['frame'].template(),
            pdf=entry['folding']['path_id'],
            backbone=entry['frame'].id,
            sequence=entry['found_sequence'],
        )
        stored_rows.append(row)

    # The query parameters are saved together with the rows they produced.
    query_record = InputData(
        transcript_name=transcript_name,
        minimum_CG=minimum_CG,
        maximum_CG=maximum_CG,
        maximum_offtarget=maximum_offtarget,
        scaffold=scaffold,
        immunostimulatory=immuno,
        results=stored_rows
    )

    db_session.add(query_record)
    db_session.add_all(stored_rows)
    db_session.commit()

    return stored_rows
def seed_initial_data():
    """Fill the reference tables that are still empty.

    Four tables are checked in turn, and each is populated only when a
    count query reports zero rows:

    * ``Utr`` and ``HumanmRNA`` are loaded from freshly downloaded
      database files, one row per parsed ``(sequence, reference)`` pair.
    * ``Backbone`` and ``Immuno`` are loaded from the literal records
      defined in this function.

    Everything is committed once, at the very end.
    """
    default_backbones = [
        Backbone(
            name='miR-30a',
            flanks3_s='TGCCTACTGCCTCGGACTTCAAGGGGCTACTTTAGGAGCA',
            flanks3_a='TGCTCCTAAAGTAGCCCCTTGAAGTCCGAGGCAGTAGGCA',
            flanks5_s='CTAAAGAAGGTATATTGCTGTTGACAGTGAGCGAC',
            flanks5_a='GTCGCTCACTGTCAACAGCAATATACCTTCTTTAG',
            loop_s='CTGTGAAGCCACAGATGGG',
            loop_a='CCCATCTGTGGCTTCACAG',
            miRNA_s='UGUAAACAUCCUCGACUGGAAG',
            miRNA_a='CTTCCAGTCGAGGATGTTTGCAGC',
            miRNA_length=22,
            miRNA_min=19,
            miRNA_max=25,
            miRNA_end_5=-2,
            miRNA_end_3=0,
            structure='./data/structures/miR-30a',
            homogeneity=4,
            miRBase_link=('http://www.mirbase.org/cgi-bin/mirna_entry.pl'
                          '?acc=MI0000088'),
            active_strand=3
        ),
        Backbone(
            name='miR-155',
            flanks3_s=('GTGTATGATGCCTGTTACTAGCATTCACATGGAACAAATTGCTGCTGCCGTGGG'
                       'AGGATGACAAAGA'),
            flanks3_a=('TCTTTGTCATCCTCCCACGGCAGCAGCAATTTGTTCCATGTGAATGCTAGTAAC'
                       'AGGCATCATACAC'),
            flanks5_s='AGGCTTGCTGTAGGCTGTATGCTG',
            flanks5_a='CAGCATACCTACAGCAAGCCT',
            loop_s='TTTTGCCTCCAACTGA',
            loop_a='TCAGTTGGAGGCAAAA',
            miRNA_s='UUAAUGCUAAUCGUGAUAGGGGU',
            miRNA_a='CUCCUACAUAUUAGCAUUAACA',
            miRNA_length=23,
            miRNA_min=20,
            miRNA_max=26,
            miRNA_end_5=-2,
            miRNA_end_3=1,
            structure='./data/structures/miR-155',
            homogeneity=5,
            miRBase_link=('http://www.mirbase.org/cgi-bin/mirna_entry.pl'
                          '?acc=MIMAT0000646'),
            active_strand=5
        ),
        Backbone(
            name='miR-21',
            flanks3_s='CTGACATTTTGGTATCTTTCATCTGACCATCCATATCCAATGTTCTCATT',
            flanks3_a='AATGAGAACATTGGATATGGATGGTCAGATGAAAGATACCAAAATGTCAG',
            flanks5_s='TACCATCGTGACATCTCCATGGCTGTACCACCTTGTCGGG',
            flanks5_a='CCCGACAAGGTGGTACAGCCATGGAGATGTCACGATGGTA',
            loop_s='CTGTTGAATCTCATGG',
            loop_a='CCATGAGATTCAACAG',
            miRNA_s='UAGCUUAUCAGACUGAUGUUGA',
            miRNA_a='CAACACCAGUCGAUGGGCUGU',
            miRNA_length=22,
            miRNA_min=19,
            miRNA_max=24,
            miRNA_end_5=-1,
            miRNA_end_3=1,
            structure='./data/structures/miR-21',
            homogeneity=4,
            miRBase_link=('http://www.mirbase.org/cgi-bin/mirna_entry.pl'
                          '?acc=MI0000077'),
            active_strand=5
        ),
        Backbone(
            name='miR-122',
            flanks3_s='GCTACTGCTAGGCAATCCTTCCCTCGATAAATGTCTTGGCATCGTTTGCTT',
            flanks3_a='AAGCAAACGATGCCAAGACATTTATCGAGGGAAGGATTGCCTAGCAGTAGC',
            flanks5_s='TGGAGGTGAAGTTAACACCTTCGTGGCTACAGAGTTTCCTTAGCAGAGCTG',
            flanks5_a='CAGCTCTGCTAAGGAAACTCTGTAGCCACGAAGGTGTTAACTTCACCTCCA',
            loop_s='TGTCTAAACTATCA',
            loop_a='TGATAGTTTAGACA',
            miRNA_s='UGGAGUGUGACAAUGGUGUUUG',
            miRNA_a='AACGCCAUUAUCACACUAAAUA',
            miRNA_length=22,
            miRNA_min=21,
            miRNA_max=23,
            miRNA_end_5=-2,
            miRNA_end_3=2,
            structure='./data/structures/miR-122',
            homogeneity=5,
            miRBase_link=('http://www.mirbase.org/cgi-bin/mirna_entry.pl'
                          '?acc=MI0000442'),
            active_strand=5
        ),
        Backbone(
            name='miR-31',
            flanks3_s='CTTTCCTGTCTGACAGCAGCTTGGCTACCTCCGTCCTGTTCCTCCTTGTCTT',
            flanks3_a='AAGACAAGGAGGAACAGGACGGAGGTAGCCAAGCTGCTGTCAGACAGGAAAG',
            flanks5_s='CATAACAACGAAGAGGGATGGTATTGCTCCTGTAACTCGGAACTGGAGAGG',
            flanks5_a='CCTCTCCAGTTCCGAGTTACAGGAGCAATACCATCCCTCTTCGTTGTTATG',
            loop_s='GTTGAACTGGGAACC',
            loop_a='GGTTCCCAGTTCAAC',
            miRNA_s='AGGCAAGAUGCUGGCAUAGCU',
            miRNA_a='UGCUAUGCCAACAUAUUGCCAU',
            miRNA_length=21,
            miRNA_min=19,
            miRNA_max=23,
            miRNA_end_5=-1,
            miRNA_end_3=1,
            structure='./data/structures/miR-31',
            homogeneity=4,
            miRBase_link=('http://www.mirbase.org/cgi-bin/mirna_entry.pl'
                          '?acc=MI0000089'),
            active_strand=5
        ),
        Backbone(
            name='miR-26a',
            flanks3_s='GGGACGC',
            flanks3_a='GCGTCCC',
            flanks5_s='GTGGCCTCG',
            flanks5_a='CGAGGCCAC',
            loop_s='GTGCAGGTCCCAATGGG',
            loop_a='CCCATTGGGACCTGCAC',
            miRNA_s='UUCAAGUAAUCCAGGAUAGGCU',
            miRNA_a='CCUAUUCUUGGUUACUUGCACG',
            miRNA_length=22,
            miRNA_min=21,
            miRNA_max=23,
            miRNA_end_5=-2,
            miRNA_end_3=2,
            structure='./data/structures/miR-26a',
            homogeneity=4,
            miRBase_link=('http://www.mirbase.org/cgi-bin/mirna_entry.pl'
                          '?acc=MI0000083'),
            active_strand=5
        ),
        Backbone(
            name='miR-106b',
            flanks3_s='TCCAGCAGG',
            flanks3_a='CCTGCTGGA',
            flanks5_s='CCTGCCGGGGC',
            flanks5_a='GCCCCGGCAGG',
            loop_s='AGTGGTCCTCTCCGTGCTA',
            loop_a='TAGCACGGAGAGGACCACT',
            miRNA_s='UAAAGUGCUGACAGUGCAGAU',
            miRNA_a='CCGCACUGUGGGUACUUGCUGC',
            miRNA_length=21,
            miRNA_min=20,
            miRNA_max=22,
            miRNA_end_5=-3,
            miRNA_end_3=2,
            structure='./data/structures/miR-106b',
            homogeneity=2,
            miRBase_link=('http://www.mirbase.org/cgi-bin/mirna_entry.pl'
                          '?acc=MI0000734'),
            active_strand=0
        )
    ]

    immuno_motifs = [
        Immuno(
            sequence='UGUGU',
            receptor='TLR7 and TLR8',
            link='http://www.ncbi.nlm.nih.gov/pubmed/16609928'
        ),
        Immuno(
            sequence='GUCCUUCAA',
            receptor='TLR7 and TLR8',
            link='http://www.ncbi.nlm.nih.gov/pubmed/15723075'
        ),
        Immuno(
            sequence='GU',
            receptor='TLR7 and TLR8',
            link='http://www.ncbi.nlm.nih.gov/pubmed/16609928'
        ),
        Immuno(
            sequence='AU',
            receptor='TLR8',
            link='http://www.ncbi.nlm.nih.gov/pubmed/18322178'
        ),
        Immuno(
            sequence='UGGC',
            receptor='',
            link='http://www.ncbi.nlm.nih.gov/pubmed/16682561'
        ),
        Immuno(
            sequence='UUUUU',
            receptor='',
            link='http://www.ncbi.nlm.nih.gov/pubmed/15778705'
        )
    ]

    def table_is_empty(model):
        # A table is seeded only when it holds no rows at all.
        return db_session.query(model).count() == 0

    if table_is_empty(Utr):
        utr_dump = download.download_utr_database()
        for sequence, reference in parse_utr_database(utr_dump):
            utr_row = Utr(sequence=sequence, reference=reference)
            db_session.add(utr_row)

    if table_is_empty(HumanmRNA):
        mrna_dump = download.download_human_all_database()
        for sequence, reference in parse_mRNA_database(mrna_dump):
            mrna_row = HumanmRNA(sequence=sequence, reference=reference)
            db_session.add(mrna_row)

    # Built-in records: same "only if empty" rule, Backbone before Immuno.
    for model, rows in ((Backbone, default_backbones), (Immuno, immuno_motifs)):
        if table_is_empty(model):
            db_session.add_all(rows)

    db_session.commit()
def shmir_from_transcript_sequence(
    transcript_name, minimum_CG, maximum_CG, maximum_offtarget, scaffold,
    stimulatory_sequences
):
    """Generating function of shmir from transcript sequence.

    Results stored for an identical earlier query are returned straight
    from the database. Otherwise candidate sequences are searched in the
    transcript's mRNA, validated and scored through task queues, and the
    best-scored results are stored together with the query parameters
    before being returned.

    Args:
        transcript_name(str): Name of transcript.
        minimum_CG(int): Minimum number of 'C' and 'G' nucleotide in sequence.
        maximum_CG(int): Maximum number of 'C' and 'G' nucleotide in sequence.
        maximum_offtarget(int): Maximum offtarget.
        scaffold(str): Name of frame of miRNA or 'all'.
        stimulatory_sequences(str): One of 'yes', 'no', 'no_difference'.

    Returns:
        list of sh-miR(s): the ``as_json()`` form of each stored result,
        at most ten for a freshly computed query.

    Raises:
        Only ``NoResultFound`` from the stored-input lookup is handled
        here; any other exception from the database, the mRNA lookup or
        the task queues propagates to the caller.
    """
    # Pattern-based scoring stops once this many results are collected.
    collected_results_cap = 20
    # Number of best-scored results that are stored and returned.
    stored_results_limit = 10

    def validated(sequences, regexp_type):
        """Validate candidates on the "blast" queue, dropping None results."""
        return remove_none(
            group(
                validate_and_offtarget.s(
                    sequence,
                    maximum_offtarget,
                    minimum_CG,
                    maximum_CG,
                    stimulatory_sequences,
                    regexp_type
                ).set(queue="blast")
                for sequence in sequences
            ).apply_async().get()
        )

    # check if results are in database
    try:
        stored_input = db_session.query(InputData).filter(
            func.lower(InputData.transcript_name) == transcript_name.lower(),
            InputData.minimum_CG == minimum_CG,
            InputData.maximum_CG == maximum_CG,
            InputData.maximum_offtarget == maximum_offtarget,
            func.lower(InputData.scaffold) == scaffold.lower(),
            func.lower(
                InputData.stimulatory_sequences
            ) == stimulatory_sequences.lower()
        ).outerjoin(InputData.results).one()
    except NoResultFound:
        pass
    else:
        return [result.as_json() for result in stored_input.results]

    # create path string
    path = "_".join(
        map(
            str,
            [transcript_name, minimum_CG, maximum_CG, maximum_offtarget,
             scaffold, stimulatory_sequences]
        )
    )

    mRNA = ncbi_api.get_mRNA(transcript_name)

    # Name comparison is case-insensitive; 'all' selects every backbone.
    if scaffold == 'all':
        original_frames = db_session.query(Backbone).all()
    else:
        original_frames = db_session.query(Backbone).filter(
            func.lower(Backbone.name) == scaffold.lower()
        ).all()

    frames_by_name = {frame.name: frame for frame in original_frames}

    # Each frame's regexp JSON becomes a dict ordered by descending key.
    patterns = {
        frame.name: OrderedDict(
            sorted(
                json.loads(frame.regexp).items(),
                reverse=True
            )
        ) for frame in original_frames
    }

    best_sequences = defaultdict(list)

    for name, patterns_dict in patterns.iteritems():
        found = find_by_patterns(patterns_dict, mRNA)
        for regexp_type, sequences in found.iteritems():
            # allow_join_result wraps the blocking ``.get()`` in validated().
            with allow_join_result():
                is_empty, sequences = generator_is_empty(sequences)
                if not is_empty:
                    # NOTE(review): this assignment replaces whatever an
                    # earlier regexp type stored for the same frame, so only
                    # the last non-empty type is kept per frame. Confirm that
                    # this is intended rather than accumulating.
                    best_sequences[name] = validated(
                        sequences, int(regexp_type)
                    )

    results = []
    for name, seq_dict in unpack_dict_to_list(best_sequences):
        # ``>=`` rather than ``==``: a single task may contribute several
        # results at once, so the count can step over the cap without ever
        # being equal to it.
        if len(results) >= collected_results_cap:
            break
        with allow_join_result():
            shmir_result = shmir_from_fasta_string.s(
                seq_dict['sequence'],
                [frames_by_name[name]],
                seq_dict['offtarget'],
                seq_dict['regexp'],
                path
            ).set(queue="score").apply_async().get()

            if shmir_result:
                results.extend(shmir_result)

    if not results:
        # Fallback: no pattern-based result, so every possible sequence of
        # the mRNA (arguments 19 and 21) is validated with regexp type 0 and
        # scored against all selected frames.
        best_sequences = []
        sequences = all_possible_sequences(mRNA, 19, 21)

        with allow_join_result():
            is_empty, sequences = generator_is_empty(sequences)
            if not is_empty:
                best_sequences = validated(sequences, 0)

        if best_sequences:
            with allow_join_result():
                results = chain(*remove_none(
                    group(
                        shmir_from_fasta_string.s(
                            seq_dict['sequence'],
                            original_frames,
                            seq_dict['offtarget'],
                            seq_dict['regexp'],
                            path
                        ).set(queue="score")
                        for seq_dict in best_sequences
                    ).apply_async().get()
                ))

    # Each result is indexable; element 0 is the score used for ranking.
    sorted_results = sorted(
        results,
        key=operator.itemgetter(0),
        reverse=True
    )[:stored_results_limit]

    db_results = [
        Result(
            score=score,
            sh_mir=shmir,
            pdf=path_id,
            backbone=frames_by_name[frame_name].id,
            sequence=found_sequences[0],
        )
        for score, shmir, frame_name, path_id, found_sequences
        in sorted_results
    ]

    # Called with the task ids of the results that are being kept.
    remove_bad_foldings(path, (result.get_task_id() for result in db_results))

    db_input = InputData(
        transcript_name=transcript_name,
        minimum_CG=minimum_CG,
        maximum_CG=maximum_CG,
        maximum_offtarget=maximum_offtarget,
        scaffold=scaffold,
        stimulatory_sequences=stimulatory_sequences,
        results=db_results
    )
    db_session.add(db_input)
    db_session.add_all(db_results)
    db_session.commit()

    return [result.as_json() for result in db_results]