def _learn_tree(self, data, attributes, depth):
    """Recursively build an ID3-style decision tree.

    :param data: dataset wrapper (provides raw_data, attributes,
        get_row_subset)
    :param attributes: attribute names still available for splitting
    :param depth: current depth; recursion stops when the next level
        would reach self.max_depth
    :return: root Node of the learned (sub)tree
    """
    # Base case 1: nothing left to split on, or depth limit reached ->
    # majority-label leaf.
    if not len(attributes) or depth + 1 == self.max_depth:
        leaf = Node()
        leaf.set_is_leaf(utils.most_common(data))
        return leaf
    # Base case 2: all rows share one label -> pure leaf.
    if utils.all_the_same_label(data):
        leaf = Node()
        label = data.raw_data[0][0]
        leaf.set_is_leaf(label)
        return leaf
    base_entropy = utils.entropy(data)
    attribute_name, attribute_ig = utils.best_attribute(
        data, attributes, base_entropy)
    attribute = data.attributes[attribute_name]
    root = Node(attribute_name, attribute.possible_vals, attribute.index)
    depth += 1
    # Hoisted out of the loop: the chosen attribute is removed exactly once.
    # The original re-ran remove_attribute (and rebound `attributes`) on
    # every non-empty branch, redundantly recomputing the same list.
    remaining = utils.remove_attribute(attributes, attribute_name)
    for attribute_value in root.possible_vals:
        b = Branch(attribute_value)
        root.add_branch(b)
        data_sample = data.get_row_subset(attribute_name, attribute_value)
        if not len(data_sample):
            # No training rows for this value: fall back to the parent's
            # majority label.
            leaf = Node()
            leaf.set_is_leaf(utils.most_common(data))
            b.set_child(leaf)
        else:
            b.set_child(self._learn_tree(data_sample, remaining, depth))
    return root
def measureAccuracy(X, Cindex, n, numberOfClasses):
    """Assign a label to each cluster and report precision/recall.

    :param X: indexable collection of records, each with a 'class' key
    :param Cindex: list of clusters, each a list of indices into X
        (sorted in place by cluster size)
    :param n: number of records
    :param numberOfClasses: number of clusters to label
    :return: predicted label per record (list of length n)
    """
    classTrue = [0] * n
    classPred = [0] * n
    # get true labels
    for j in range(0, n):
        classTrue[j] = X[j]['class']
    # pick a label for each class that gives the best results
    Cindex.sort(key=len)
    usedLabels = []
    for j in range(0, numberOfClasses):
        temp = []
        for i in range(0, len(Cindex[j])):
            if not X[Cindex[j][i]]['class'] in usedLabels:
                temp.append(X[Cindex[j][i]]['class'])
        # Hoisted: most_common(temp) was recomputed for every member of the
        # cluster (accidental O(n^2)); compute the winning label once.
        # NOTE(review): if every label in this cluster was already used,
        # temp is empty here — most_common([]) presumably fails; confirm.
        label = most_common(temp)
        for i in range(0, len(Cindex[j])):
            classPred[Cindex[j][i]] = label
        usedLabels.append(label)
    # calc precision and recall values
    result = ratio(classTrue, classPred)
    print("Precision:", result[0])
    print("Recall:", result[1])
    return classPred
def recover_derg(self, app_derg):
    """
    recover third party library nodes given an obfuscated derg
    :param app_derg: the app's DERG whose packages are matched against the
        known third-party library repository
    :return: the same app_derg, mutated in place with recovered names
    """
    assert (isinstance(app_derg, DERG))
    for package in app_derg.get_packages():
        package_derg = app_derg.get_package_derg(package)
        # mapping: matched-lib node id -> app node id (per match_3lib_package)
        matched_package, mapping = self.match_3lib_package(package_derg)
        if matched_package and mapping:
            # a lib package may appear under several names; take the
            # most frequent one as the canonical name
            matched_package_name = utils.most_common(
                matched_package['packages'])
            print("matched third party package: %s" % matched_package_name)
            matched_derg = matched_package['derg']
            for matched_id, node_id in mapping.items():
                node = app_derg.g.nodes[node_id]
                node_type = node['type']
                # skip node types that are never obfuscated
                if node_type in STATIC_NODE_TYPES:
                    continue
                # strip any suffix, e.g. 'class_xxx' -> 'class'
                node_type = node_type.split('_')[0]
                if node_type in ['package', 'class', 'method', 'field']:
                    matched_node = matched_derg.g.nodes[matched_id]
                    # tag the node as third-party and attach original names
                    node['type'] = node_type + '_3lib'
                    node['recovered_name'] = matched_node['name']
                    node['recovered_sig'] = matched_node['sig']
            print("recovered third party package: %s" % matched_package_name)
    return app_derg
def fast_match_3lib_package(self, package_derg):
    """Cheap hash-overlap pre-match of a package derg against known libs.

    :param package_derg: derg of one (possibly obfuscated) app package
    :return: canonical name of the first library whose node-hash overlap
        with the package has precision > 0.9, or None
    """
    # Loop-invariant: the package's hashes do not change per candidate lib.
    package_hashes = package_derg.get_node_hashes()
    for lib_package in self.lib_packages:
        lib_derg = lib_package['derg']
        lib_package_name = utils.most_common(lib_package['packages'])
        lib_hashes = lib_derg.get_node_hashes()
        common_hashes = lib_hashes.intersection(package_hashes)
        common_count = len(common_hashes)
        # too little overlap to be meaningful
        if common_count < 3:
            continue
        precision = float(common_count) / len(package_hashes)
        # Bug fix: recall must be normalised by the library's hash count;
        # the original divided by len(package_hashes), duplicating precision.
        recall = float(common_count) / len(lib_hashes)
        # NOTE(review): recall is computed but not used in the decision —
        # confirm whether it should also gate the match.
        if precision > 0.9:
            return lib_package_name
    return None
def match_3lib_package(self, package_derg, isomorphism_timeout=10):
    """Match a package derg against known libs via subgraph isomorphism.

    :param package_derg: derg of one (possibly obfuscated) app package
    :param isomorphism_timeout: seconds allowed per isomorphism test
    :return: (lib_package, mapping) where mapping maps lib node ids to
        package node ids, or (None, None) if no library matches
    """
    # Loop-invariant: hoisted out of the candidate loop.
    package_hashes = package_derg.get_node_hashes()
    for lib_package in self.lib_packages:
        lib_derg = lib_package['derg']
        lib_package_name = utils.most_common(lib_package['packages'])
        # quick reject: the package's node hashes must all occur in the lib
        if not lib_derg.get_node_hashes().issuperset(package_hashes):
            continue
        GM = isomorphism.DiGraphMatcher(
            lib_derg.g,
            package_derg.g,
            node_match=ThirdPartyLibRepo.node_match,
            edge_match=ThirdPartyLibRepo.edge_match)
        try:
            with utils.timeout(isomorphism_timeout):
                if GM.subgraph_is_isomorphic():
                    return lib_package, GM.mapping
        except Exception:
            # Bug fix: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. Keep the best-effort behaviour
            # (skip this candidate) but let process-control exceptions
            # propagate.
            print("graph isomorphism timeout during matching %s" %
                  lib_package_name)
    return None, None
def run_inference_on_image(self):
    """ Run inference on images * Creates the graph * Loads the image *
    Performs inference

    Classifies every *.jpeg under self.image_path with the loaded
    TensorFlow session, aggregates the per-image top-1 predictions, and
    publishes an obstacle/clear message via self.sender.
    NOTE(review): Python 2 syntax (print statements below) — do not run
    under Python 3 without porting.
    """
    self.detecting = True
    answer = None
    ####
    # STREAM PREDICTIONS ON IMAGES
    ####
    # Check if image directory exists
    if not tf.gfile.Exists(self.image_path):
        tf.logging.fatal('File does not exist %s', self.image_path)
        return answer
    # Get images list
    file_list = glob.glob(os.path.join(self.image_path, '*.jpeg'))
    if len(file_list) > 0:
        answers = []
        scores = []
        for im_path in file_list:
            # Load image
            image_data = tf.gfile.FastGFile(im_path, 'rb').read()
            # Run inference
            # (feeds raw JPEG bytes; 'Placeholder_1:0' presumably is a
            # keep-prob/dropout input pinned to 1.0 — confirm in graph def)
            predictions = self.sess.run(self.softmax_tensor, {
                'DecodeJpeg/contents:0': image_data,
                'Placeholder_1:0': 1.0
            })
            predictions = np.squeeze(predictions)
            # Getting top prediction
            top_1 = predictions.argsort()[-1:][0]
            answers.append(self.labels[top_1])
            scores.append(predictions[top_1])
        # Get most common label
        most_common_label = utils.most_common(answers)
        # Get average probability of most common label
        proba_most_common = utils.proba_most_common(
            most_common_label, answers, scores)
        self.detecting = False
        # Send to Rabbit MQ
        if proba_most_common > 0.8:
            self.imageDetected = True
            if self.enqueueClear == False:
                print('image detected (label image)')
                self.sender.obstacleDetection(
                    '{"type": "OBSTACLE","payload": { "obstacle": "' +
                    self.should_stop_train(most_common_label.upper()) +
                    '", "obstacleType": "' + most_common_label.upper() +
                    '"}}')
            else:
                self.clear_detection()
        else:
            # confidence too low: report the path as clear
            self.send_clear()
        # remove processed images before the next polling cycle
        self._clear_folder(self.image_path)
        print "Most common label: %s" % most_common_label
        print "Probability: %s" % proba_most_common
        time.sleep(2)
    else:
        print "No data"
        time.sleep(5)
# Collect bookmakers that are missing from at least one match record, then
# drop them so every remaining bookmaker has a prediction for every match.
notcompletebetters = []
for m in matchdata:
    for b in betkeys:
        if b not in m.keys() and b not in notcompletebetters:
            notcompletebetters.append(b)
betkeys = [k for k in betkeys if k not in notcompletebetters]
# Build one row per match: all bookmaker predictions, the majority vote,
# and the actual full-time result.
betpreds = []
for m in matchdata:
    tmp = []
    for b in betkeys:
        tmp.append(m[b])
    tmp.append(most_common(tmp))  # Calculate book maker prediction as most common prediction among all bookmakers
    # NOTE(review): most_common is re-run on tmp *including* the majority
    # just appended; the extra copy cannot change the winner, so the
    # result is the same majority vote.
    m['BetResult'] = most_common(tmp)
    tmp.append(m['FTR'])  # actual full-time result
    betpreds.append(tmp)
data['matchdata'] = matchdata
# header columns for the extra majority-vote and result fields
betkeys.append('MajorityBet')
betkeys.append('Result')
bfile = open("betters.csv", "wb")  # Stores the better predictions for each match
writer = csv.writer(bfile, quoting=csv.QUOTE_ALL)
writer.writerow(betkeys)
for tr in betpreds:
    writer.writerow(tr)
bfile.close()
def heuristic_1(fae, sae, och):
    """Link Lightning Network nodes to on-chain entities (heuristic 1).

    :param fae: funding address -> entity mapping
    :param sae: settlement address -> entity mapping
    :param och: dict of boolean flags ('stars', 'none', 'snakes',
        'proxies', 'all') selecting which output files are written
    :return: dict of result statistics

    NOTE(review): relies on many module-level globals (channels,
    funding_txs, funded_address_settlement_txs, settlement_addresses,
    settlement_txs_hashes, node_openings_closings, node_channels,
    use_entities, input_file1, output_file_* ...) — confirm they are
    initialised before calling.
    """
    global output_file_1, output_file_2
    # defensive copies so set_mapping below cannot mutate the caller's dicts
    funding_address_entity = {k: v for k, v in fae.items()}
    settlement_address_entity = {k: v for k, v in sae.items()}
    r = dict()
    r['n_funding_entities'] = len(set(funding_address_entity.values()))
    r['n_settlement_entities'] = len(set(settlement_address_entity.values()))
    r['n_entities'] = len(
        set(settlement_address_entity.values()).union(
            set(funding_address_entity.values())))
    r['n_addresses'] = len(
        set(settlement_address_entity.keys()).union(
            set(funding_address_entity.keys())))
    r['n_nodes'] = len(node_channels)
    funding_address_entity, settlement_address_entity, = \
        set_mapping(funding_address_entity, settlement_address_entity, och)
    # print('Start heuristic 1...')
    blockstream_funding_txs = read_json(input_file1)
    # print('use_entities', use_entities)
    # mapping between stx (settlement tx) and its ftx (funding tx)
    stx_its_ftx = dict()
    for channel in channels.values:
        # channel[0] is the channel point 'funding_tx_hash:output_index'
        funding_tx, out_index = channel[0].split(':')
        funded_address = \
            funding_txs[funding_tx]['outputs'][int(out_index)]['address']
        settlement_txs = funded_address_settlement_txs[funded_address]
        if len(settlement_txs) == 1:  # it is always zero or one tx
            stx = settlement_txs[0]['tx_hash']
            if stx not in stx_its_ftx:
                stx_its_ftx[stx] = funding_tx
            else:
                print('stx already in dict', stx)
    # create links for heuristic 1 (both at address and entity level)
    stx_a_ftx = []  # list of settlement tx, address, funding tx
    for uftx in blockstream_funding_txs.values():
        for i in uftx['vin']:
            a = i['prevout']['scriptpubkey_address']
            prev_tx = i['txid']
            if a in settlement_addresses:
                if prev_tx in settlement_txs_hashes:
                    stx_a_ftx.append([prev_tx, a, uftx['txid']])
                # else:
                #     # a is a settlement_address but prev_tx is not a
                #     # settlement_tx in our data
    stx_e_ftx = []  # list of settlement tx, entity, funding tx
    for uftx in blockstream_funding_txs.values():
        for i in uftx['vin']:
            e = funding_address_entity[i['prevout']['scriptpubkey_address']]
            prev_tx = i['txid']
            if e in settlement_address_entity.values():
                if prev_tx in settlement_txs_hashes:
                    stx_e_ftx.append([prev_tx, e, uftx['txid']])
    # known transaction that funds two channels at once; excluded below
    tx_2in1 = '88679369ec778d5187c207676c788e7d22272e64c120e0cd6e06858864bdb5e9:1'
    # I need a mapping between funding tx and nodes
    # (ignore case with two funding txs in one tx cause one channel is still open)
    # and between settlement tx and nodes
    ftx_nodes = dict()
    for channel in channels.values:
        if channel[0] != tx_2in1:
            ftx = channel[0].split(':')[0]
            ftx_nodes[ftx] = [channel[1], channel[2]]
    funded_address_channel = dict()
    for channel in channels.chan_point.values:
        hsh, out_index = channel.split(':')
        funded_address = funding_txs[hsh]['outputs'][int(out_index)]['address']
        if funded_address not in funded_address_channel:
            funded_address_channel[funded_address] = channel
        else:
            print(funded_address, ' has multiple channels')
    stx_nodes = dict()
    for fa, channel in funded_address_channel.items():
        if channel != tx_2in1:
            stxs = funded_address_settlement_txs[fa]
            ftx = channel.split(':')[0]
            if stxs:
                stx = stxs[0]['tx_hash']
                stx_nodes[stx] = ftx_nodes[ftx]
    # print('Initial number of links addresses', len(stx_a_ftx))
    # print('Initial number of links entities', len(stx_e_ftx))
    # decide link level: address or entity
    triplet = stx_a_ftx
    if use_entities:
        triplet = stx_e_ftx
    links = []  # like stx_a_ftx plus 4 nodes of channels
    for el in triplet:
        # the funding entity controls the node in common between the channel
        # opened with ftx and closed with stx
        stx, a, ftx = el
        n1, n2 = ftx_nodes[ftx]
        # happens after the stx
        n3, n4 = stx_nodes[stx]
        links.append([stx, a, ftx, n1, n2, n3, n4])
    # keep only links where exactly one node is shared between the two
    # channels (3 distinct nodes among the 4)
    useful_links = []
    for link in links:
        s = set(link[3:])
        if len(s) == 3:
            useful_links.append(link)
    # if closing of other node in ch1 > opening of other node in ch2
    # then we can use the link
    usable_links = []
    for link in useful_links:
        node_in_common = most_common(link[3:])
        other_node_ch1 = ''
        other_node_ch2 = ''
        for node in link[3:][::-1]:
            if node != node_in_common:
                if not other_node_ch1:
                    other_node_ch1 = node
                else:
                    other_node_ch2 = node
        if node_openings_closings[other_node_ch1]['last_activity'] > \
                node_openings_closings[other_node_ch2]['first_activity']:
            usable_links.append(link)
    # reliable: the settlement address reappears among the inputs of the
    # funding tx that the settlement tx settles
    reliable_links_addresses = []
    for link in usable_links:
        link_address = link[1]
        stx = link[0]
        its_ftx = stx_its_ftx[stx]
        if link_address in [
                el['address'] for el in funding_txs[its_ftx]['inputs']
        ]:
            reliable_links_addresses.append(link)
    # print('Number of reliable links at address level:',
    #       len(reliable_links_addresses))
    reliable_links_entities = []
    entities_reusing = set()
    for link in usable_links:
        if use_entities:
            link_entity = link[1]
        else:
            link_entity = settlement_address_entity[link[1]]
        stx = link[0]
        its_ftx = stx_its_ftx[stx]
        if link_entity in [
                funding_address_entity[el['address']]
                for el in funding_txs[its_ftx]['inputs']
        ]:
            entities_reusing.add(link_entity)
            reliable_links_entities.append(link)
    # print('Number of reliable links at entity level:',
    #       len(reliable_links_entities))
    # print('Number of entities reusing funding addresses:', len(entities_reusing))
    # step 1: linking nodes to entity using stx and ftx
    # print('Step 1:')
    heuristic_1a_entity_node = dict()
    heuristic_1a_node_entity = dict()
    for link in reliable_links_entities:
        if use_entities:
            e = link[1]
        else:
            e = settlement_address_entity[link[1]]
        n = most_common(link[3:])
        if e not in heuristic_1a_entity_node:
            heuristic_1a_entity_node[e] = set()
        heuristic_1a_entity_node[e].add(n)
        if n not in heuristic_1a_node_entity:
            heuristic_1a_node_entity[n] = set()
        heuristic_1a_node_entity[n].add(e)
    # print('Number of entities linked to nodes:', len(heuristic_1a_entity_node))
    # print('Number of nodes linked to entities:', len(heuristic_1a_node_entity))
    # print('Step 2:')
    # link other node and entity in channel
    heuristic_1b_entity_node = link_other_nodes(heuristic_1a_entity_node,
                                                channels,
                                                funded_address_settlement_txs,
                                                funding_txs,
                                                settlement_address_entity)
    heuristic_1b_node_entity = invert_mapping(heuristic_1b_entity_node)
    # correct means that the settlement tx has exactly two output entities
    correct_stxs = []  # correct stxs
    correct_settlement_entities = set()  # output entities of correct stxs
    correct_nodes = set()
    for channel in channels.values:
        funding_tx, out_index = channel[0].split(':')
        node_1 = channel[1]
        node_2 = channel[2]
        funded_address = \
            funding_txs[funding_tx]['outputs'][int(out_index)]['address']
        settlement_txs = funded_address_settlement_txs[funded_address]
        # if channel is closed and number of outputs == 2 and
        # one node is mapped to one entity in the outputs
        if settlement_txs:
            # it is always only one
            for settlement_tx in settlement_txs:
                # count entities
                entities = set([
                    settlement_address_entity[out['address']]
                    for out in settlement_tx['outputs']
                ])
                if len(entities) == 2:
                    correct_stxs.append(settlement_tx)
                    correct_settlement_entities = correct_settlement_entities.union(
                        entities)
                    correct_nodes.add(node_1)
                    correct_nodes.add(node_2)
    perc_entities_linked_settled = round(
        100 * len(heuristic_1b_entity_node) / r['n_settlement_entities'], 2)
    perc_entities_linked_2e = round(
        100 * len(heuristic_1b_entity_node) / len(correct_settlement_entities),
        2)
    perc_nodes_linked_2e = round(
        100 * len(heuristic_1b_node_entity) / len(correct_nodes), 2)
    r = get_results(r, heuristic_1b_entity_node, heuristic_1b_node_entity)
    print('Number of settlement entities:', r['n_settlement_entities'], '--',
          perc_entities_linked_settled, '% linked')
    print(
        'Number of settlement entities considering settlement txs with 2 output entities:',
        len(correct_settlement_entities), '--', perc_entities_linked_2e,
        '% linked')
    print('Number of nodes considering settlement txs with 2 output entities:',
          len(correct_nodes), '--', perc_nodes_linked_2e, '% linked')
    addresses_linked = set()
    for address_entity in [funding_address_entity, settlement_address_entity]:
        for address, entity in address_entity.items():
            if entity in heuristic_1b_entity_node:
                addresses_linked.add(address)
    r['perc_addresses_linked'] = round(
        100 * len(addresses_linked) / r['n_addresses'], 2)
    # choose output files based on the och flags; later flags win if
    # several are set ('all' overrides everything back to the defaults)
    output_file_a = output_file_1
    output_file_b = output_file_2
    if och['stars']:
        output_file_a = output_file_4
        output_file_b = output_file_5
    if och['none']:
        output_file_a = output_file_6
        output_file_b = output_file_7
    if och['snakes']:
        output_file_a = output_file_10
        output_file_b = output_file_11
    # if och['collectors']:
    #     output_file_a = output_file_8
    #     output_file_b = output_file_9
    if och['proxies']:
        output_file_a = output_file_8
        output_file_b = output_file_9
    if och['all']:
        output_file_a = output_file_1
        output_file_b = output_file_2
    # Write to file
    heuristic_1_entity_node = {
        str(k): [e for e in v]
        for k, v in heuristic_1b_entity_node.items()
    }
    heuristic_1_node_entity = {
        k: [int(e) for e in v]
        for k, v in heuristic_1b_node_entity.items()
    }
    print(och)
    print('writing to', output_file_a, output_file_b)
    write_json(heuristic_1_entity_node, output_file_a)
    write_json(heuristic_1_node_entity, output_file_b)
    return r
def main():
    """Answer a series of genome-analysis questions over dna4.fasta.

    NOTE(review): Python 2 syntax (print statements). Relies on the
    project-local `utils` module (readGenome, find_orf, get_all_repeats,
    most_common) and on `Counter` being imported at module level.
    """
    sequences, total_records = utils.readGenome('dna4.fasta')
    # question 1: total records in the file
    print total_records
    # answer: 22
    # question: longest sequence in the file?
    lengths = [len(i) for i in sequences]
    print(max(lengths))
    # answer: 4815
    print(min(lengths))
    # answer 40
    # question: what is the length of the longest ORF appearing in reading frame 1 of any of the sequences?
    max_orf = 0
    for i in sequences:
        cur_seq = utils.find_orf(i, 0)
        for j in cur_seq:
            if len(j) > max_orf:
                max_orf = len(j)
    print max_orf
    # answer 1767
    # question: what is the length of the longest ORF appearing in any sequence and in any forward reading frame?
    max_orf = 0
    for i in sequences:
        cur_seq = utils.find_orf(i, 1)
        for j in cur_seq:
            if len(j) > max_orf:
                max_orf = len(j)
    print max_orf
    # answer 1770
    # question: what is the length of the longest forward ORF that appears in the sequence with the identifier gi|142022655|gb|EQ086233.1|349?
    # NOTE(review): the literal below contains single-space separators every
    # 70 bases (FASTA line width) — confirm utils.find_orf tolerates them.
    test_orf = "GATCGCCGCCTGGGTTGTCGAGACACCTGCGCGTGCGCGTCGAACGAAACACCTTGACCCACCGTATGCC CGGCACCGCGCGCGTCCCGGCCGACCTCGCGACACCGAGCGGCACCGCTTCGAAGCATTCTAGCCGGCTC GCGCTTCGCGAACCACCTTTTCGGACGAAAATCCGCACGTTGAATCACTTTCCTGCTTCGTATTTCACGC AAACTGCGTACAATCCTGAGACAACAGTACGTCAACTTCAGGAGAGCAACGATGCCCCCTCGCAAGGATC GCGATACGCCCCATCGCTATCGCAGCGGCGAGGCCGCGCGCCTGGCGCGCATGCCGGCAGCCACGCTGAG AATCTGGGAACGGCGCTATGGCGTGGTTGCGCCGCCCAAAACGCCGTCCGGACAACGGCTGTACTCGGAC GACGACGTGCAGCGCATTCGATTGCTGAAAACGCTCGTCAATCAGGGCCACGCGATCGGGTCGATCGCCA GCCTGAGCCGCGAGGAACTCGAGGCGTTGTCGTTGACGAATACGCGTGACCCGGCGTTTCACGAGGCAAG TGTGAGCCTCGCGGTCGTCGGCGCGCTTTCGATTCCGGAAGCCGCGATCGAGCGAATGGGAATCCGGATC GCCGCGCGAATCGACTCGCTCGACGACACGAGCGCGCATGCGGGTACGTCGGTCGATGCCCTCATCGCGA CGACCACGTCGCTCCATGAGGATGTCGTTTCGCAGCTCGCTGCCCAGGCGCAACAGCTCAACGCGCACGC CGTGGCCGTCGTATACGGGTTCGGCACGGCAGAAGCGGTCGAGCTGGCGCGTCTGTCGGGGTTCGAGCTG TTCCGGTCGACGGAAGGCCAGACCAACCCGATATCGATCATTTCGAAACTGGCGCAAGCCGTCGTCAAGT CGCGCCAATCGAATGACGCGGATCGCGGGCTCTGGCTGCGCACGCGGCGACGCTTCGACGAGGCGACGCT CGCGTCGCTCAGCGGCCTGTCCACCACCGTCAAATGCGAGTGTCCGCGTCACCTCTCCGAATTGATCATG CAGCTCAGTGCGTTCGAGCGATACAGCGACGAATGCGTGTCGCGATCGCCGGCCGATGCGCTGCTGCACC GCCACCTTGGAGACGCAGCGAACCGGGCAGCCGAATTGCTCGAGACGGCGCTTGCCGTCATTCTCCGCGA AGAGGGATTGGGCGGGACGACGCCGGAACTGAAGGCGCTGTAGCGCGGCACGCGCCGCCGGCTGTTCCGA CCTGCCGACGACGGCAGGTGGCGATGCTCTTTCGCGTGCAATGCAGGGCTTGCGTCGATCACTGAGCCGA AACGGAAGAACGAGCCGCTGCGGCAGGCGATGCCGGCGGCCTGCCCGTGGTTCCGGCATTCGACGCATGC GCGACTCGATCCACGAACGCGGAGAGATCGTCGAGCACTGACGCCATCCCTCTCAACGGCGCGCCCAGAA CACCGACGATCGACGCATGGCCCACGTTCCGGTAACGCATGACCACGACGGTGTCGCCCTTCTCTTGCAA TGCTCGCGCGAAGCGGGTCGTGTTGCCGGGCTCGACCACGGTGTCGTTCTCCGCGGTGGCCAGCCACATC GGCGGCTCCGTACCCTGAATGAACCGGATGGGCTGGCTCGCGGCCCGCACTTCCTGCGGGAATATCCTTT CAAGCGTGGTATCGCGCAGCGGCAGGAAATCATAAGCCCCGGCCAGGCCAATCACGCCGGCGATATCGCT CTTCCGCATCGCCTGTGCCGCCAGATAGCGGCCGTCGGTCGCAAGCAATGCGGCAATCTGCGCGCCCGCG GAATGCCCCATCAGAAACAGGCGATGTGGATCGCCGCCGAACGCAACCGCGTGCTCGCGTGCCCACGCGA CCGCCTGCGCCGCATCGTCGACGAAACCGGGAAAGGTGGTCGCCGGATACGTCCTGTAGTCGGGTAAGAC GGCAACAAAGCCCCTCGACGCGAGCGCCTCTCCCACGAACAGATAGTCCTTGCGCTCGCCGGACTGCCAG CTTCCGCCGTAAAGGAACACGACCACAGGGGCGCCCGCACTCGCATCGGCCGGCCAGTGATGCAAGACGC GCGTGGGCAAATAGACGTCGAGCACCTGGCGTTCGCCGGATCCGTACGGGATACCTGCGAACAGACTGAA CGTGTAGCTCGGCGTCAGCGCATTCAGGAGCCGCACCGGGCTGCACGCGGAGAGGAGACCGGCCGCGAGC AGCACCGACAGGACGACAAGCCCGGCTTTCATGTTCATGGAGATCCCCATTCCTGACGATTCCGGCCGCA TCCGCCGCCTGGTACGAGGTTTACGGCGCTTGCGCGCAAGCGGATGCACGCATCGCATGGCAACCGCGCC CCTTGACGGCATCCAGATCTTTCCTGCGCAAGTGCATCCGTCCGCAACGGAGAGTCGTATGTGAATGGAT AGGTGAATCAACGCGGAATGCCGACCATCGCTCGCTGCAAAGCAATCGTCCGGTGGCGAGTCCGCTCGTC GACGATAGTGAGAGCCGTCTGCCATGAGCGTTCTACCTGCCACTTACCCCGAGATGCAACGTCGACGTGG CGGCACGGCGACCATGCCGTTACCGATGATCCCGCGCGAACGATCATGAGGAGCGCGCCGAATCAACTGA CGTCGAGCACGCAAAAGTCCGGCGCCGCTCGCGTGTACGTCTATCTCGCGACGACCCAGACAGGATGGCT GGTATGCGTGATGACTGCCGCAGCGCATCACGCCGCGTGGGGCGTCACCTATGCGCTGATCGCGACAGCG GGCCATCTTCTCTTCGCGCGTCGGCCCGCATCCGAGGCGCGGATCGTCATCACGGTCACGGTGTCCGGAT GGTTATGGGACAGCGCCGTTGCACATTCCGGCCTGCTCGTGTACCCGAACGGCGTTTTTCTCAAAGGTAC AGCGCCGTACTGGCTCGCGGGGCTGTGGGCGCTGTTCGCGATTCAACTCAACACCTTGCTGCTCTGGCTT CGGGCGCGACCGCTCGTCTCGGCGCTCGTCGGCGCATTCGCAGGCCCCGCATCCTTTCGCGCAGGTGCGG CGCTGGGGGCCGTTCATTTCAAAGACTCGGCTGCAGCGCTCGTCGTTCTCGCAACCGGCTGGGCGTTCAT CTTGCCGGCCGCGCTTGCGATTGCAAGCCATTGGGATGGCGTAACGCCCCCTTCTCCTCCGCCAATCGGC GCAGGCGACATGAATGACGCCCGCGCCGGATAGAGCCGGACGCGTCGTAAGCCAGCGTTATCTCCGATCC CGTTCAAATTGCCAACGTACCTTCTCAGGCACCACACATGACACGCACCGAATTGCCGTATGAATCCCGC CCCGTTATCGTATGGTTTCGGGATGACCAACGACTCAGCGACAATCCCGCACTCTCTCATGCGGTCAGTA CCGGCCATCCTGTTGTTTGCGTCTACGTCTACGACCCTGCCCCGAAGCTCGGGCGCGCCATGGGGGGCGC GCAGAAGTGGTGGCTGCACGAGTCGTTGAAAAAACTCGACGACTCGCTTTCCGCTCTCGGCGGCTCGCTG CTCGTGCTTCGCGGTAACGAACACGAAGCCATCAGGAGCCTCGCCGTCGAGACCCGGGCGGCAATGGTTT TCTGGAATCGCCGCTACTCGAAAGCGCAAACGGAAATGGATGCATCGATCAAGAAAGACCTGATCGGGCG CGGCATCGACGTGTCGACATTCAATGGCCATCTTTTGCGCGAACCCTGGACAGTGGCCACGCGCGAAGGC TTGCCGTTCCAGGTATTCAGCGCGTACTGGAGAGCCGCTCGCCGCGATAATTTTTTCCCGCCGTGCCCAC TGTCGGCGCCCGCCCGGGTCACGTTCTTTCCCGTCTCCAGAAACGTCAGCGCACACGTCTGTACGCTTCC CGCGCTTGCACTGCAGCCCTCGACGCCGGACTGGGCGGAGGGCCTGCGTGCAACCTGGCGATGCGGCGAG GAAGCGGCCGGGCATCAACTCGAGGCCTTCATTGAACACTCGTTTTCCGACTATGCCGGCGCTCGAGATT TTCCGGCCACTCGAGCGACGAGCCGGCTCTCTCCGTATCTTCGCTTCGGAAATATCTCGGCCCGGCAGGT GTGGTACGCGACGTTATCAGCGGTAGACGCGATGCGAAGCAGGCGAGTTGTTCGCATTGACGATGCCAAA AATGAGTCGTTGAACAAGTTCTTCAGTGAACTCGGATGGAGAGAATTCTCGTATTACCTTCTTTACCACT GCGAACCCCTTCATCAGGTCAATTTCCGGCGTCAGTTTGACGCCATGCCGTGGCGTACCGACGCCAAGGC GCTTCGCGCGTGGCAAAGGGGGAAAACAGGATACCCGCTGGTCGACGCCGGCATGCGCGAGCTTTGGCAC ACGGGCTGGATGCACAACCGCGTGCGCATGGTGACAGCGTCATTTCTCACCAAGCACTTGCTGATCGACT GGCGCGAGGGCGAAGCATGGTTCTGGGATACGCTGGTTGACGCG"
    # scan all three forward reading frames of the test sequence
    max_orf = 0
    cur_seq = utils.find_orf(test_orf, 0)
    for j in cur_seq:
        if len(j) > max_orf:
            max_orf = len(j)
    cur_seq = utils.find_orf(test_orf, 1)
    for j in cur_seq:
        if len(j) > max_orf:
            max_orf = len(j)
    cur_seq = utils.find_orf(test_orf, 2)
    for j in cur_seq:
        if len(j) > max_orf:
            max_orf = len(j)
    print max_orf
    # 972 or 975
    # q: find the most frequently occurring repeat of length 6 in all
    # sequences. How many times does it occur in all?
    all_repeats = []
    for i in sequences:
        repeats_list = utils.get_all_repeats(i,6)
        for j in repeats_list:
            all_repeats.append(j)
    print(all_repeats.count(utils.most_common(all_repeats)))
    # answer 208
    # q: find all repeats of length 11 in the input file. Let's use Max to specify the number of copies of the most frequent repeat of length 11. How many different 11-base sequences occur Max times?
    all_repeats = []
    for i in sequences:
        repeats_list = utils.get_all_repeats(i,11)
        for j in repeats_list:
            all_repeats.append(j)
    print Counter(all_repeats).most_common(10)
    # answer: 5
    # count occurrences of four specific 7-mers across all sequences
    seq1=0
    seq2=0
    seq3=0
    seq4=0
    for i in sequences:
        repeats_list = utils.get_all_repeats(i,7)
        for j in repeats_list:
            if j == 'CGGCGCG':
                seq1 +=1
            if j == 'CGGCACG':
                seq2 +=1
            if j == 'GCGGCAC':
                seq3 +=1
            if j == 'TCGGCGG':
                seq4 +=1
    print str(seq1) + " | " + str(seq2) + " | " + str(seq3) + " | " + str(seq4) + " | "