def generate_gene_table(table_path, name): loader = GeneTableLoader() table = loader.createTable(name, loader.loadUnprocessed(table_path)) rows = [] for key, value in table: start, end, strand = key.split(Table.ID_DELIMITER)[:3] row = [value["unique_id"], value["name"], value["blattner-id"], value["unique-id"], int(start), int(end), strand] synonyms = [""] * 4 for index, name in enumerate(value["other_names"]): synonyms[index] = name row.extend(synonyms) rows.append(row) # for row in rows: # print row db = MySQLdb.connect(host="localhost",user="******",db="article_refactor_24_3_2016") cur = db.cursor(MySQLdb.cursors.DictCursor) key_list = ["unique_id", "name", "blattner_id", "gene_unique_id", "start", "end", "strand", "synonym_1", "synonym_2", "synonym_3", "synonym_4"] fields = ", ".join("%s VARCHAR(200)" % key for key in key_list) cur.execute("CREATE TABLE %s (%s)" % (table.get_name(), fields)) for row in rows: values = ",".join("%s" % db.literal(str(val)) for val in row ) cur.execute("INSERT INTO %s VALUES (%s)" % (table.get_name(), values)) db.commit()
def generate_gene_table(table_path, name): loader = GeneTableLoader() table = loader.createTable(name, loader.loadUnprocessed(table_path)) rows = [] for key, value in table: start, end, strand = key.split(Table.ID_DELIMITER)[:3] row = [ value["unique_id"], value["name"], value["blattner-id"], value["unique-id"], int(start), int(end), strand ] synonyms = [""] * 4 for index, name in enumerate(value["other_names"]): synonyms[index] = name row.extend(synonyms) rows.append(row) # for row in rows: # print row db = MySQLdb.connect(host="localhost", user="******", db="article_refactor_24_3_2016") cur = db.cursor(MySQLdb.cursors.DictCursor) key_list = [ "unique_id", "name", "blattner_id", "gene_unique_id", "start", "end", "strand", "synonym_1", "synonym_2", "synonym_3", "synonym_4" ] fields = ", ".join("%s VARCHAR(200)" % key for key in key_list) cur.execute("CREATE TABLE %s (%s)" % (table.get_name(), fields)) for row in rows: values = ",".join("%s" % db.literal(str(val)) for val in row) cur.execute("INSERT INTO %s VALUES (%s)" % (table.get_name(), values)) db.commit()
def addDirectionalityToPdfTables(): loader = GeneTableLoader() geneTable = loader.createTable("geneDB", loader.loadUnprocessed("our_files/genes.col")) pdfLoader = PDFTableLoader() # handling table 2 (not from supplementary) result = pdfLoader.loadUnprocessed("./parsed_files/2.table") # match for each entry in the in s5 table a strand for entry in result: id, data = geneTable.findByField(Table.NAME_FIELD, entry["name"]) strand = None # if there is no data look in the old names if (id == None): id, data = geneTable.findByOtherNames(entry["name"]) if (data != None): strand = id.split(Table.ID_DELIMITER)[2] entry[TableGlobals.FIRST_STRAND_KEY] = strand entry[TableGlobals.SECOND_STRAND_KEY] = strand pdfTable = pdfLoader.createTable("dummy", result) pdfTable.dump("./final_format/2_directed.table") # S5 handling result = pdfLoader.loadUnprocessed("./parsed_files/s5.table") # match for each entry in the in s5 table a strand for entry in result: id, data = geneTable.findByField(Table.NAME_FIELD, entry["name"]) strand = None # if there is no data look in the old names if (id == None): id, data = geneTable.findByOtherNames(entry["name"]) if (data != None): strand = id.split(Table.ID_DELIMITER)[2] entry[TableGlobals.FIRST_STRAND_KEY] = strand entry[TableGlobals.SECOND_STRAND_KEY] = strand pdfTable = pdfLoader.createTable("dummy", result) pdfTable.dump("./final_format/s5_directed.table") # S8 handling result = pdfLoader.loadUnprocessed("./parsed_files/s8.table") # match for each entry in the in s5 table a strand for entry in result: id, data = geneTable.findByField(Table.NAME_FIELD, entry["name"]) strand = None # if there is no data look in the old names if (id == None): id, data = geneTable.findByOtherNames(entry["name"]) if (data != None): print "[warning] using by old name" strand = id.split(Table.ID_DELIMITER)[2] entry[TableGlobals.FIRST_STRAND_KEY] = strand entry[TableGlobals.SECOND_STRAND_KEY] = strand pdfTable = pdfLoader.createTable("dummy", result) pdfTable.dump("./final_format/s8_directed.table") #S7 handling result = pdfLoader.loadUnprocessed("./parsed_files/s7.table") for entry in result: strand = entry.pop("strand") if (strand == "f"): strand = Table.POS_STRAND elif (strand == "r"): strand = Table.NEG_STRAND entry[TableGlobals.FIRST_STRAND_KEY] = strand entry[TableGlobals.SECOND_STRAND_KEY] = entry[TableGlobals.FIRST_STRAND_KEY] pdfTable = pdfLoader.createTable("dummy", result) pdfTable.dump("./final_format/s7_directed.table") # S6 handling result = pdfLoader.loadUnprocessed("./parsed_files/s6.table") pdfLoader.createTable("dummy", result).dump("./final_format/s6_directed.table")
def lybecker_update(file_name, show_warnings=True, overlap_delimiter="/", overlap_field="annotation of overlapping genes", adjacent_field="adjacent genes", loader_type=LybeckerS2TableLoader): geneLoader = GeneTableLoader() gene_table = geneLoader.createTable("genes", geneLoader.loadUnprocessed("./genes.col")) loader = loader_type() table = loader.createTable("lybecker", loader.load("lybecker/final/%s" % file_name)) new_table_raw = [] for id, info in table: dct = {} info.pop(Table.UNIQUE_ID_FIELD) start, end, strand = id.split(";")[:3] # print info["name"] start = int(start) end = int(end) result = gene_table.is_overlaps(start, end, "none") if show_warnings: print id overlaps = [gene for gene in info[overlap_field].split(overlap_delimiter) if gene != ""] print overlaps for index, gene in enumerate(result): print "%d: %s" % (index, gene[1][1]["name"]) if gene[1][1]["name"] not in overlaps: print "[warning] older name is being used" if len(overlaps) > len(result): print "[warning] missing overlapping gene" if len(overlaps) < len(result): print "[warning] extra overlapping gene" # Set the record location dct[TableGlobals.FIRST_START_BASE_KEY], dct[TableGlobals.FIRST_END_BASE_KEY], \ dct[TableGlobals.FIRST_STRAND_KEY], dct[TableGlobals.SECOND_START_BASE_KEY], \ dct[TableGlobals.SECOND_END_BASE_KEY], dct[TableGlobals.SECOND_STRAND_KEY] = id.split(Table.ID_DELIMITER) # Assume unknown strand dct[TableGlobals.FIRST_STRAND_KEY] = "none" dct[TableGlobals.SECOND_STRAND_KEY] = "none" is_valid = result[0][0] overlap_names = info[overlap_field].split(overlap_delimiter) # Check if major in overlapping names if not is_valid: is_valid = False # print "major names", [gene[1][1]["name"] for gene in result] for first in [gene[1][1]["name"] for gene in result]: for second in overlap_names: if first in second: is_valid = True break if is_valid: break # check if minor in overlapping names if not is_valid: is_valid = False other_names = [] for gene in result: other_names.extend(gene[1][1]["other_names"]) # print "old names", other_names for first in other_names: for second in overlap_names: if first in second: is_valid = True break if is_valid: break if "intergenic" == info["category"]: is_valid = False adjacent_genes = [val for val in info[adjacent_field].split(overlap_delimiter) if val != ""] first = gene_table.findByName(adjacent_genes[0]) second = gene_table.findByName(adjacent_genes[1]) if first == (None, None): first = gene_table.findByOtherNames(adjacent_genes[0]) if second == (None, None): second = gene_table.findByOtherNames(adjacent_genes[1]) if first != (None, None): representing = first elif second != (None, None): representing = second else: raise BaseException("No presenting gene found") strand = representing[0].split(Table.ID_DELIMITER)[2] if strand == TableGlobals.STRAND_POSITIVE: dct[TableGlobals.FIRST_STRAND_KEY] = TableGlobals.STRAND_NEGATIVE dct[TableGlobals.SECOND_STRAND_KEY] = TableGlobals.STRAND_NEGATIVE else: dct[TableGlobals.FIRST_STRAND_KEY] = TableGlobals.STRAND_POSITIVE dct[TableGlobals.SECOND_STRAND_KEY] = TableGlobals.STRAND_POSITIVE if is_valid: is_valid = "divergent" not in info["category"] and \ "convergent" not in info["category"] # Update the record name if gene was found if is_valid: info["name"] = "overlapping_" else: name_id = id.split(Table.ID_DELIMITER)[:2] name_id.append(dct[TableGlobals.FIRST_STRAND_KEY]) info["name"] = "lybecker_%s_%s" % (info["category"], Table.ID_DELIMITER.join(name_id)) pos_count = 0 neg_count = 0 if is_valid: # for each gene match for entry in result: # exact match add name info["name"] += "%s." % entry[1][1]["name"] strand = entry[1][0].split(Table.ID_DELIMITER)[2] if TableGlobals.STRAND_NEGATIVE == strand: neg_count += 1 if TableGlobals.STRAND_POSITIVE == strand: pos_count += 1 if neg_count == 0: dct[TableGlobals.FIRST_STRAND_KEY] = TableGlobals.STRAND_NEGATIVE dct[TableGlobals.SECOND_STRAND_KEY] = TableGlobals.STRAND_NEGATIVE elif pos_count == 0: dct[TableGlobals.FIRST_STRAND_KEY] = TableGlobals.STRAND_POSITIVE dct[TableGlobals.SECOND_STRAND_KEY] = TableGlobals.STRAND_POSITIVE else: print "no strand match: %s" % entry[1][1]["name"] # remove extra . from name if match found if is_valid: info["name"] = info["name"][:-1] dct.update(info) new_table_raw.append(dct) if show_warnings: print 20 * "*" # for row in new_table_raw: # print row TableLoader().createTable("updated_lybecker", new_table_raw).dump("lybecker/final/updated_%s" % file_name)
def printMatchAnalysis(): """ This function is used to print the matched directions for the know rna from the pdf so we can make sure the direction we matched them is valid and there was no problems with the name (since there are new and old ones and the article is from 2012). :return: None """ loader = GeneTableLoader() result = loader.loadUnprocessed("genes.col") table = loader.createTable("geneDB", result) count = {} for entry in result: name = entry["NAME".lower()].lower() if not count.has_key(name): count[name] = 1 else: count[name] += 1 if count[name] > 1: print "warning" old_count = {} warnings = "" old_warnings = "" for entry in result: for name in entry["other_names"]: lower_name = name.lower() if count.has_key(lower_name): count[lower_name] += 1 warnings += ("warning: %s is an old name of %s = %d\n" % (lower_name, entry["NAME".lower()], count[lower_name])) elif (old_count.has_key(lower_name)): old_count[lower_name] += 1 old_warnings += ("warning: old multiplicity for %s = %d\n" % (lower_name, old_count[lower_name])) else: old_count[lower_name] = 1 print "-" * 100 print "counting instances of major names and old names" print "-" * 100 print warnings print "-" * 100 print "counting instances of old names between themselves" print "-" * 100 print old_warnings print "-" * 100 # matches = table.is_overlaps(510860, 510863, "+") # # for match in matches: # res, id = match # print res, table.findById(id) # # print table.findByName("ybaS") pdfLoader = PDFTableLoader() result = pdfLoader.loadUnprocessed("./parsed_files/s5.table") pdfTable = pdfLoader.createTable("s5", result) print "-" * 100 print "print matches to majors" print "-" * 100 other_name_matches = "" total_count = {} total_count.update(count) total_count.update(old_count) #table_name = "srna-name" table_name = "name" # find matching candidates to copy their directionality for entry in result: match = table.findByField("name", entry[table_name]) if (match != (None, None)): first_entry_start, first_entry_end, dummy_a,\ second_entry_start, second_entry_end, dummy_b = \ pdfTable.findByField(table_name, entry[table_name])[0].split(";") startDiff = int(first_entry_start) endDiff = int(first_entry_end) first_entry_start, first_entry_end, dummy_a,\ second_entry_start, second_entry_end, dummy_b = match[0].split(";") startDiff -= int (int(first_entry_start)) endDiff -= int(first_entry_end) print "%s from PDF match to: %s diff is: %d;%d" % (entry[table_name], match[0], startDiff, endDiff) if(total_count[entry[table_name].lower()] > 1): print "%s has multiple instances" % entry[table_name] else: match = table.findByOtherNames(entry[table_name]) other_name_matches += "%s match to other names: %s\n" % (entry[table_name], match[0]) if (match[0] != None): first_entry_start, first_entry_end, dummy_a,\ second_entry_start, second_entry_end, dummy_b = \ pdfTable.findByField(table_name, entry[table_name])[0].split(";") startDiff = int(first_entry_start) endDiff = int(first_entry_end) first_entry_start, first_entry_end, dummy_a,\ second_entry_start, second_entry_end, dummy_b = match[0].split(";") startDiff -= int (int(first_entry_start)) endDiff -= int(first_entry_end) other_name_matches += "the diff is %d;%d\n" % (startDiff, endDiff) if (total_count.has_key(entry[table_name])): other_name_matches += "%s appears %d in countings\n" % \ (entry[table_name], total_count[entry[table_name]]) print "-" * 100 print "matches to old names (in case of no match to major)" print "-" * 100 print other_name_matches