def generateUniqueID(paper):
    """
    Returns a simple string id that is the mashup of the title and authors.

    The id has the form ``<title>_<authors>``, lower-cased. ``<title>`` is
    ``normalizeTitle(paper.title)`` with all whitespace removed. ``<authors>``
    concatenates one piece per author: the string itself for string authors,
    or the first character of the family name followed by the first character
    of the given name for dict authors. Dict authors without a family name
    contribute nothing.

    :param paper: object exposing ``title``, ``authors`` and an ``extra_data``
        dict. A non-empty ``extra_data['xref_author']`` is used as the author
        list; otherwise ``paper.authors`` is parsed with ``parseBibAuthors``.
    :return: the lower-cased id string
    """
    authors = paper.extra_data.get('xref_author')
    if not authors:
        try:
            authors = parseBibAuthors(paper.authors)
        except Exception:
            # Best effort: an unparsable author string must not prevent id
            # generation, so fall back to a single blank author.
            print("Failed to parse authors string", paper.authors)
            authors = [{'given': '', 'family': ''}]

    author_parts = []
    for author in authors:
        if isinstance(author, str):
            author_parts.append(author)
        elif author.get('family'):
            # `or '_'` also covers a given name that is present but empty or
            # None, which would otherwise fail when indexed with [0].
            given = author.get('given') or '_'
            author_parts.append(author['family'][0] + given[0])
    author_bit = "".join(author_parts)

    title_bit = re.sub(r"\s+", "", normalizeTitle(paper.title))
    return (title_bit + "_" + author_bit).lower()
def findPapersByTitle(self, title):
    """
    Looks up stored papers whose ``norm_title`` column equals the normalized
    form of the given title.

    :param title: title to search for; passed through ``normalizeTitle``
        before being compared
    :return: list of objects built with ``Paper.fromRecord`` (one per matching
        row), or None when no row matches
    """
    cursor = self.conn.cursor()
    cursor.execute("SELECT * FROM papers WHERE norm_title=?",
                   (normalizeTitle(title), ))
    rows = cursor.fetchall()
    if rows:
        return [Paper.fromRecord(row) for row in rows]
    return None
def norm_title(self):
    """Return ``self.title`` passed through ``normalizeTitle``."""
    normalized = normalizeTitle(self.title)
    return normalized
def findPaperByApproximateTitle(self,
                                paper,
                                ok_title_distance=0.35,
                                ok_author_distance=0.1):
    """
    Very simple similarity matching of a paper against the stored papers.

    Candidate ids are fetched from ``papers_search`` by MATCHing any of the
    non-stopword tokens of the normalized title. The corresponding rows are
    loaded from ``papers`` and reranked with ``rerankByTitleSimilarity``. The
    top candidate is accepted only when both its title distance and its
    author distance are within the given thresholds.

    :param paper: paper to look for; its ``title`` drives the search and the
        whole object is passed to ``computeAuthorDistance``
    :param ok_title_distance: maximum accepted ``dist.distance`` between the
        lower-cased titles
    :param ok_author_distance: maximum accepted ``computeAuthorDistance``
    :return: the best-matching stored paper, or None when there is no
        candidate or the best candidate exceeds a threshold
    """
    c = self.conn.cursor()
    self.createVirtualTable()

    tokens = [b for b in normalizeTitle(paper.title).split()
              if b not in stopwords]
    if not tokens:
        # Nothing left to search for; an empty MATCH expression is not a
        # meaningful query.
        return None

    c.execute(
        'SELECT id, norm_title FROM papers_search WHERE norm_title MATCH ?',
        (" OR ".join(tokens), ))
    # dict.fromkeys drops duplicate ids while keeping their order.
    candidate_ids = list(dict.fromkeys(row['id'] for row in c.fetchall()))
    if not candidate_ids:
        return None

    # Bind the ids as parameters instead of pasting them into the SQL text,
    # and query in batches so a large candidate set stays below SQLite's
    # limit on host parameters per statement.
    batch_size = 500
    paper_records = []
    for start in range(0, len(candidate_ids), batch_size):
        batch = candidate_ids[start:start + batch_size]
        placeholders = ",".join("?" * len(batch))
        c.execute('SELECT * FROM papers WHERE id IN (%s)' % placeholders,
                  batch)
        paper_records.extend(c.fetchall())
    if not paper_records:
        return None

    results = [Paper.fromRecord(r) for r in paper_records]
    sorted_results = rerankByTitleSimilarity(results, paper.title)
    # Each reranked entry is indexed with [1] to obtain the paper object.
    top_res = sorted_results[0][1]

    title_distance = dist.distance(top_res.title.lower(),
                                   paper.title.lower())
    author_distance = computeAuthorDistance(paper, top_res)

    if (title_distance > ok_title_distance
            or author_distance > ok_author_distance):
        print('\n[skipped] ', paper.title)
        print(
            'Options:\n' + '\n'.join([r[1].title for r in sorted_results[:5]]),
            '\n')
        return None

    print('\n[matched] ', paper.title)
    print('Best match:', top_res.title)
    print('title distance:', title_distance, 'author distance:',
          author_distance)
    return top_res
def buildHashTable(bib):
    """
    Index bibliography entries by their normalized title.

    :param bib: iterable of entries supporting ``entry['title']``
    :return: dict mapping ``normalizeTitle(entry['title'])`` to the entry;
        when several entries share a normalized title the last one wins
    """
    return {normalizeTitle(item['title']): item for item in bib}