示例#1
0
def generateUniqueID(paper):
    """
    Returns a simple string id that is the mashup of the title and authors.

    The id is ``<title>_<authors>`` in lower case, where ``<title>`` is the
    normalized title with all whitespace removed and ``<authors>`` is the
    concatenation of one fragment per author: the author itself when it is
    a plain string, otherwise the first letter of the family name followed
    by the first letter of the given name (``_`` when the given name is
    missing or empty). Dict authors without a family name contribute nothing.

    :param paper: object exposing ``title``, ``authors`` and an
        ``extra_data`` mapping (``extra_data['xref_author']`` is preferred
        over parsing ``authors`` when it is present and non-empty)
    :return: the lower-cased id string
    """
    authors = paper.extra_data.get('xref_author')
    if not authors:
        try:
            authors = parseBibAuthors(paper.authors)
        except Exception:
            # Best effort: an unparseable author string still yields an id
            # based on the title alone (the placeholder has no family name,
            # so it adds nothing to the author part).
            print("Failed to parse authors string", paper.authors)
            authors = [{'given': '', 'family': ''}]

    author_parts = []
    for author in authors:
        if isinstance(author, str):
            author_parts.append(author)
        elif author.get('family'):
            family_initial = author['family'][0]
            # `or '_'` also covers a 'given' key that is present but empty
            # or None, which would otherwise fail on the [0] index.
            given_initial = (author.get('given') or '_')[0]
            author_parts.append(family_initial + given_initial)
    author_bit = ''.join(author_parts)

    title_bit = re.sub(r"\s+", "", normalizeTitle(paper.title))

    return (title_bit + "_" + author_bit).lower()
示例#2
0
    def findPapersByTitle(self, title):
        """
        Looks up papers whose stored normalized title equals the
        normalized form of the given title.

        :param title: title to search for; it is passed through
            ``normalizeTitle`` before being compared to ``norm_title``
        :return: list of ``Paper`` objects built with ``Paper.fromRecord``,
            or None when no row matches
        """
        cursor = self.conn.cursor()
        lookup_key = normalizeTitle(title)

        cursor.execute("SELECT * FROM papers WHERE norm_title=?",
                       (lookup_key, ))
        rows = cursor.fetchall()

        if rows:
            return [Paper.fromRecord(row) for row in rows]
        return None
示例#3
0
 def norm_title(self):
     """Return ``self.title`` passed through ``normalizeTitle``."""
     normalized = normalizeTitle(self.title)
     return normalized
示例#4
0
    def findPaperByApproximateTitle(self,
                                    paper,
                                    ok_title_distance=0.35,
                                    ok_author_distance=0.1):
        """
        Very simple similarity matching of a paper against the stored ones.

        Candidate rows are fetched with a ``MATCH`` query on
        ``papers_search`` that ORs together the non-stopword tokens of the
        normalized title. The candidates are then reranked with
        ``rerankByTitleSimilarity`` and only the top one is considered: it
        is accepted when both its title distance and its author distance
        to ``paper`` are within the given thresholds. The outcome is
        reported on stdout either way.

        :param paper: paper to look for; its ``title`` is used for the
            search and the whole object for the author comparison
        :param ok_title_distance: maximum accepted title distance
            (inclusive)
        :param ok_author_distance: maximum accepted author distance
            (inclusive)
        :return: the best matching stored ``Paper``, or None when there is
            nothing to search for, no candidate is found, or the best
            candidate is not close enough
        """
        # Upper bound on bound parameters per statement; older SQLite builds
        # allow only 999 host parameters, so stay well below that.
        id_batch_size = 500

        c = self.conn.cursor()

        self.createVirtualTable()

        norm_title = normalizeTitle(paper.title)

        bits = [b for b in norm_title.split() if b not in stopwords]
        if not bits:
            # An empty MATCH expression cannot match anything useful (and is
            # a syntax error for some full-text engines).
            return None

        query_string = " OR ".join(bits)

        c.execute(
            'SELECT id, norm_title FROM papers_search WHERE norm_title MATCH ?',
            (query_string, ))
        paper_ids = c.fetchall()
        if not paper_ids:
            return None

        # Bind the ids instead of interpolating them into the SQL text: the
        # previous double-quoted values are parsed by SQLite as identifiers
        # first, and string-built SQL breaks on ids containing quotes.
        candidate_ids = [str(res['id']) for res in paper_ids]
        paper_records = []
        for start in range(0, len(candidate_ids), id_batch_size):
            batch = candidate_ids[start:start + id_batch_size]
            placeholders = ",".join("?" * len(batch))
            c.execute('SELECT * FROM papers WHERE id IN (%s)' % placeholders,
                      batch)
            paper_records.extend(c.fetchall())
        if not paper_records:
            return None

        results = [Paper.fromRecord(r) for r in paper_records]

        sorted_results = rerankByTitleSimilarity(results, paper.title)

        # Each reranked item is indexed as item[1] to get the Paper.
        top_res = sorted_results[0][1]

        title_distance = dist.distance(top_res.title.lower(),
                                       paper.title.lower())
        author_distance = computeAuthorDistance(paper, top_res)

        is_match = (title_distance <= ok_title_distance
                    and author_distance <= ok_author_distance)
        if not is_match:
            print('\n[skipped] ', paper.title)
            print(
                'Options:\n' +
                '\n'.join([r[1].title for r in sorted_results[:5]]), '\n')
            return None

        print('\n[matched] ', paper.title)
        print('Best match:', top_res.title)
        print('title distance:', title_distance, 'author distance:',
              author_distance)

        return top_res
示例#5
0
def buildHashTable(bib):
    """
    Index bibliography entries by their normalized title.

    :param bib: iterable of entries, each subscriptable with ``'title'``
    :return: dict mapping ``normalizeTitle(entry['title'])`` to the entry;
        when several entries share a normalized title, the last one wins
    """
    return {normalizeTitle(item['title']): item for item in bib}