Example #1
    def process_train(self, selected, train):
        '''
        Preprocess the training data.
        '''
        obs = len(selected) * 2  # total number of observations (two per selected claim)
        nvars = len(self.f2v)  # feature vector dimensionality
        X = np.zeros(shape=(obs, nvars), dtype=np.float32)
        y = np.zeros(shape=(obs), dtype=np.float32)
        obsnum = 0

        # Convert each example into a feature vector
        for example in tqdm(train):
            cid = example['id']
            if cid in selected:
                claim = example['claim']
                c_toks = set(word_tokenize(claim.lower()))
                for yn in selected[cid]:
                    [title, lid, line, tscore] = selected[cid][yn]
                    t_toks = normalize_title(title)
                    t = ' '.join(t_toks)
                    t_toks = set(t_toks)
                    l_toks = set(word_tokenize(line.lower()))
                    self.process_instance(c_toks, t, t_toks, line, l_toks, lid,
                                          tscore, obsnum, X)
                    y[obsnum] = float(yn)
                    obsnum += 1

        assert obsnum == obs
        return X, y
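A minimal sketch of the inputs this process_train appears to expect, inferred from the indexing above; the ids, titles, and scores are illustrative assumptions, and ranker is a placeholder for whatever object defines process_train:

# Illustrative only: shapes inferred from how process_train indexes its arguments.
train = [{'id': 137334, 'claim': 'Fox 2000 Pictures released the film Soul Food.'}]
selected = {
    137334: {
        1: ['Soul Food (film)', 0, 'Soul Food is a 1997 American film .', 7.2],  # positive evidence line
        0: ['Soul food', 3, 'Soul food is a cuisine .', 5.1],                    # negative evidence line
    },
}
X, y = ranker.process_train(selected, train)  # X: (2 * len(selected), len(ranker.f2v)), y: 0/1 labels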
Example #2
def phrase_features(phrase="", start=0, title="", claim=""):
    features = dict()
    stoks = phrase.split()
    t_toks, rmndr = normalize_title(title, rflag=True)
    features["rmndr"] = (rmndr == "")
    features["rinc"] = ((rmndr != "") and (rmndr in claim))
    features["start"] = start
    features["start0"] = (start == 0)
    features["lend"] = len(stoks)
    features["lend1"] = (features["lend"] == 1)
    features["cap1"] = stoks[0][0].isupper()
    features["stop1"] = (stoks[0].lower() in stop)
    features["people1"] = (stoks[0] in people)
    features["places1"] = (stoks[0] in places)
    features["capany"] = False
    features["capall"] = True
    features["stopany"] = False
    features["stopall"] = True
    features["peopleany"] = False
    features["peopleall"] = True
    features["placesany"] = False
    features["placesall"] = True
    for tok in stoks:
        features["capany"] = (features["capany"] or tok[0].isupper())
        features["capall"] = (features["capall"] and tok[0].isupper())
        features["stopany"] = (features["stopany"] or tok.lower() in stop)
        features["stopall"] = (features["stopall"] and tok.lower() in stop)
        features["peopleany"] = (features["peopleany"] or tok in people)
        features["peopleall"] = (features["peopleall"] and tok in people)
        features["placesany"] = (features["placesany"] or tok in places)
        features["placesall"] = (features["placesall"] and tok in places)
    return features
Example #3
def title_edict(t2jnum={}):
    edocs = edict()
    for title in t2jnum:
        l_txt = normalize_title(title)
        if len(l_txt) > 0:
            if edocs[l_txt][0] is None:
                edocs[l_txt] = []
            edocs[l_txt][0].append(title)
    return edocs
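A hedged usage sketch: the loop above only iterates over the keys of t2jnum, so any mapping keyed by document titles should do; edict itself is a project-specific structure whose behavior is assumed here, not shown:

# Illustrative only: t2jnum just needs page titles as keys.
t2jnum = {'Barack Obama': 0, 'Soul Food (film)': 1}
edocs = title_edict(t2jnum)  # lets normalized title tokens be looked up back to the original titles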
Example #4
def normalize_fields(data):
    if data.get('authors') is None:
        data['authors'] = []
    authors = {util.normalize_author(a) for a in data['authors']}
    data['norm-authors'] = sorted(authors)
    if data.get('title') is None:
        data['title'] = ''
    data['norm-title'] = util.normalize_title(data['title'])
    return data
Example #5
    def mint_page_for_url(self, page_class, url):
        my_page = page_class()
        my_page.pmh_id = self.id
        my_page.url = url
        my_page.doi = self.doi
        my_page.title = self.title
        my_page.normalized_title = normalize_title(self.title)
        my_page.authors = self.authors
        my_page.repo_id = self.id.split(":")[1]
        return my_page
Example #6
def title_edict(t2jnum={}):
    '''
    Build a dictionary of document titles.
    '''
    edocs = edict()
    for title in t2jnum:
        _title = normalize_title(title)
        if len(_title) > 0:
            if edocs[_title][0] is None:
                edocs[_title] = []
            edocs[_title][0].append(title)
    return edocs
Example #7
def phrase_features(phrase='', start=0, title='', claim=''):
    '''
    Build the feature dictionary for a phrase.

    Parameters:
    title: document title
    phrase: a phrase taken from the claim
    claim: the claim being compared against
    start: position of the phrase within the claim

    Returns:
    features: feature dictionary for the phrase
    '''
    features = dict()  # feature dictionary
    stoks = phrase.split()  # split the phrase into tokens
    _, rmndr = normalize_title(title, rflag=True)  # normalize the title and split off the remainder

    features['rmndr'] = (rmndr == '')  # True: the title has no "(xxx)" remainder
    features['rinc'] = (
        (rmndr != '')
        and (rmndr in claim))  # True: the title has a "(xxx)" remainder and xxx appears in the claim
    features['start'] = start  # position of the title phrase within the claim
    features['start0'] = (start == 0)  # True: at the start of the claim
    features['lend'] = len(stoks)  # number of words
    features['lend1'] = (features['lend'] == 1)  # True: only one word
    features['cap1'] = stoks[0][0].isupper()  # True: the first word is capitalized
    features['stop1'] = (stoks[0].lower() in stop)  # True: the first word is a stopword
    features['people1'] = (stoks[0] in people)  # True: the first word is a person name
    features['places1'] = (stoks[0] in places)  # True: the first word is a place name
    features['capany'] = False  # True: some word is capitalized
    features['capall'] = True  # True: every word is capitalized
    features['stopany'] = False  # True: some word is a stopword
    features['stopall'] = True  # True: every word is a stopword
    features['peopleany'] = False  # True: some word is a person name
    features['peopleall'] = True  # True: every word is a person name
    features['placesany'] = False  # True: some word is a place name
    features['placesall'] = True  # True: every word is a place name

    for tok in stoks:
        features['capany'] = (features['capany'] or tok[0].isupper())
        features['capall'] = (features['capall'] and tok[0].isupper())
        features['stopany'] = (features['stopany'] or tok.lower() in stop)
        features['stopall'] = (features['stopall'] and tok.lower() in stop)
        features['peopleany'] = (features['peopleany'] or tok in people)
        features['peopleall'] = (features['peopleall'] and tok in people)
        features['placesany'] = (features['placesany'] or tok in places)
        features['placesall'] = (features['placesall'] and tok in places)

    return features
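A hedged call sketch; it assumes the module-level stop, people, and places sets referenced in the code are already loaded in the same module context, and the argument values are made up for illustration:

# Illustrative only: relies on module-level stop / people / places sets being defined.
feats = phrase_features(phrase='Soul Food',
                        start=0,
                        title='Soul Food (film)',
                        claim='Soul Food was released by Fox 2000 Pictures.')
print(feats['cap1'], feats['capall'], feats['rinc'])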
Example #8
    def calc_normalized_title(self):
        if not self.title:
            return None

        working_title = self.title

        # repo-specific rules
        # AMNH adds bibliographic info to the end of titles, which ruins matching; remove it.
        # example http://digitallibrary.amnh.org/handle/2246/6816
        if "amnh.org" in self.repo_id:
            # cut off the last part, after an opening paren
            working_title = re.sub(u"(Bulletin of.+no.+\d+)", "", working_title,
                                   flags=re.IGNORECASE | re.MULTILINE)
            working_title = re.sub(u"(American Museum nov.+no.+\d+)", "", working_title,
                                   flags=re.IGNORECASE | re.MULTILINE)

        return normalize_title(working_title)
Example #9
def best_lines(claim="",tscores=list(),lines=dict(),best=5,model=None):
    lscores=list()
    c_toks=set(word_tokenize(claim.lower()))
    for title,tscore in tscores:
        t_toks=normalize_title(title)
        t=" ".join(t_toks)
        t_toks=set(t_toks)
        for lid in lines[title]:
            line=lines[title][lid]
            l_toks=set(word_tokenize(line.lower()))
            if len(l_toks) > 0:
                if model==None:
                    lscores.append((title,lid,score_line(line_features(c_toks,t,t_toks,line,l_toks,lid,tscore))))
                else:
                    lscores.append((title,lid,model.score_instance(c_toks,t,t_toks,line,l_toks,lid,tscore)))
    lscores=sorted(lscores,key=lambda x:-1*x[2])[:best]
    return lscores
Example #10
File: pmh_record.py  Project: todun/oadoi
    def calc_normalized_title(self):
        if not self.title:
            return None

        working_title = self.title

        # repo-specific rules
        # AMNH adds bibliographic info to the end of titles, which ruins matching; remove it.
        # example http://digitallibrary.amnh.org/handle/2246/6816 oai:digitallibrary.amnh.org:2246/6816
        if "amnh.org" in self.id:
            # cut off the last part, after an opening paren
            working_title = re.sub(u"(Bulletin of.+no.+\d+)", "", working_title,
                                   flags=re.IGNORECASE | re.MULTILINE)
            working_title = re.sub(u"(American Museum nov.+no.+\d+)", "", working_title,
                                   flags=re.IGNORECASE | re.MULTILINE)

        # endpoint 0dde28a908329849966 adds this string to the end of all titles, so remove it
        # (eg http://hdl.handle.net/11858/00-203Z-0000-002E-72BD-3)
        working_title = re.sub(u"vollständige digitalisierte Ausgabe", "", working_title,
                               flags=re.IGNORECASE | re.MULTILINE)
        return normalize_title(working_title)
Example #11
def best_lines(claim='', tscores=list(), lines=dict(), best=5, model=None):
    '''
    Compute the top `best` highest-scoring lines.
    '''

    lscores = list()
    c_toks = set(word_tokenize(claim.lower()))

    for title, tscore in tscores:
        t_toks = normalize_title(title)  # normalize the title
        t = ' '.join(t_toks)
        t_toks = set(t_toks)
        for lid in lines[title]:  # iterate over the line ids for this title
            line = lines[title][lid]
            l_toks = set(word_tokenize(line.lower()))
            if len(l_toks) > 0:
                lscores.append((title, lid,
                                model.score_instance(c_toks, t, t_toks, line,
                                                     l_toks, lid, tscore)))
    lscores = sorted(lscores, key=lambda x: -1 * x[2])[:best]
    return lscores
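A minimal sketch of the tscores and lines structures best_lines indexes into; the values are invented for illustration, and trained_model stands in for any object exposing score_instance as used above:

# Illustrative only: structures inferred from how best_lines indexes them.
tscores = [('Soul Food (film)', 7.2), ('Soul food', 5.1)]            # (title, document score) pairs
lines = {
    'Soul Food (film)': {0: 'Soul Food is a 1997 American film .'},
    'Soul food': {3: 'Soul food is a cuisine .'},
}
top = best_lines(claim='Fox 2000 Pictures released the film Soul Food.',
                 tscores=tscores, lines=lines, best=5, model=trained_model)
# top: list of (title, line_id, score) tuples, highest score first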
Example #12
    def process_train(self, selected, train):
        obs = len(selected) * 2
        nvars = len(self.f2v)
        X = np.zeros(shape=(obs, nvars), dtype=np.float32)
        y = np.zeros(shape=(obs), dtype=np.float32)
        obsnum = 0
        for example in tqdm(train):
            cid = example["id"]
            if cid in selected:
                claim = example["claim"]
                c_toks = set(word_tokenize(claim.lower()))
                for yn in selected[cid]:
                    [title, lid, line, tscore] = selected[cid][yn]
                    t_toks = normalize_title(title)
                    t = " ".join(t_toks)
                    t_toks = set(t_toks)
                    l_toks = set(word_tokenize(line.lower()))
                    self.process_instance(c_toks, t, t_toks, line, l_toks, lid,
                                          tscore, obsnum, X)
                    y[obsnum] = float(yn)
                    obsnum += 1
        assert obsnum == obs
        return X, y
Example #13
def main():
    """print title each per one line from the corpus"""

    year = 2014
    # months = ['01', '02', '03', '04', '05', '06', '07']  # 2015-08-05
    months = range(11, 13)
    # months = ['02'] # 2015-08-13
    # months = ['02', '03', '04', '05'], 2015-08-05
    # months = ['03']  # 2015-08-13

    days = xrange(1, 32)
    paths = [
        '/cs/puls/Corpus/Business/Puls/{}/{}/{:2d}/'.format(year, month, day)
        for month in months for day in days
    ]

    collected = 0
    for i, fname in enumerate(get_file_names(paths)):
        if i % 100 == 0:
            logger.info("{} / {}".format(collected, i))

        try:
            title = extract_title(fname)
        except:
            logger.debug('Fail to find title')
            continue

        if not title:  # no title
            continue

        title = normalize_title(title)

        # is not monocase and is English
        if not is_monocase(nltk.word_tokenize(title)) and\
           guessLanguage(title) == "en":
            body = get_document_content_paf(fname)
            if len(body.strip()) > 0:  # non-empty
                collected += 1
                print json.dumps([fname, unicode(title).encode("utf8")])
Example #14
    def update(self):
        if not self.crossref_api_raw_new:
            self.crossref_api_raw_new = self.crossref_api_raw

        if not self.title:
            self.title = self.crossref_title
        self.normalized_title = normalize_title(self.title)

        old_response_jsonb = self.response_jsonb

        self.clear_results()
        try:
            self.recalculate()
        except NoDoiException:
            logger.info(u"invalid doi {}".format(self))
            self.error += "Invalid DOI"
            pass

        self.set_results()

        if self.has_changed(old_response_jsonb):
            self.last_changed_date = datetime.datetime.utcnow().isoformat()
Example #16
    def mint_pages(self):
        if u"oai:" not in self.id:
            return

        self.pages = []

        for url in self.get_good_urls():
            if self.doi:
                my_page = self.mint_page_for_url(PageDoiMatch, url)
                self.pages.append(my_page)

            if self.title:
                normalized_title = normalize_title(self.title)
                if normalized_title:
                    my_page = self.mint_page_for_url(PageTitleMatch, url)
                    pages_with_this_normalized_title = PageTitleMatch.query.filter(
                        PageTitleMatch.normalized_title ==
                        normalized_title).all()
                    if len(pages_with_this_normalized_title) >= 20:
                        my_page.more_than_20 = True
                    self.pages.append(my_page)

        return self.pages
Example #17
def phrase_features(phrase="",
                    start=0,
                    title="",
                    claim="",
                    ctoks=word_tokenize("dummy"),
                    termfreqs=dict()):
    features = dict()
    stoks = phrase.split()
    t_toks, rmndr = normalize_title(title, rflag=True)
    features["terms"] = 0
    features["terms0"] = 0
    numtoks = 0
    for tok in ctoks:
        if tok in termfreqs:
            tf, tf0 = termfreqs[tok]
            features["terms"] += (tf > 0)
            features["terms0"] += (tf0 > 0)
            numtoks += 1
    features["rmndr"] = (rmndr == "")
    features["rinc"] = ((rmndr != "") and (rmndr in claim))
    features["start"] = start
    features["start0"] = (start == 0)
    features["lend"] = len(stoks)
    features["lend1"] = (features["lend"] == 1)
    features["cap1"] = stoks[0][0].isupper()
    features["stop1"] = (stoks[0].lower() in stop)
    features["capany"] = False
    features["capall"] = True
    features["stopany"] = False
    features["stopall"] = True
    for tok in stoks:
        features["capany"] = (features["capany"] or tok[0].isupper())
        features["capall"] = (features["capall"] and tok[0].isupper())
        features["stopany"] = (features["stopany"] or tok.lower() in stop)
        features["stopall"] = (features["stopall"] and tok.lower() in stop)
    return features
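From the unpacking tf, tf0 = termfreqs[tok] above, termfreqs appears to map each claim token to a pair of term-frequency counts; a hedged sketch of a call, with the pair's exact meaning treated as an assumption and word_tokenize coming from the same module context:

# Illustrative only: termfreqs maps a lowercased token to an assumed (tf, tf0) count pair.
ctoks = word_tokenize('soul food was released by fox 2000 pictures .')
termfreqs = {'soul': (3, 1), 'food': (2, 0)}
feats = phrase_features(phrase='Soul Food', start=0, title='Soul Food (film)',
                        claim='Soul Food was released by Fox 2000 Pictures.',
                        ctoks=ctoks, termfreqs=termfreqs)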
Example #18
def normalize_title(meta):
    meta['norm-title'] = util.normalize_title(meta['title'])
    return meta
def api_to_db(query_doi=None,
              first=None,
              last=None,
              today=False,
              chunk_size=None):
    i = 0
    records_to_save = []

    headers = {"Accept": "application/json", "User-Agent": "impactstory.org"}

    base_url_with_last = "http://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows=1000&cursor={next_cursor}"
    base_url_no_last = "http://api.crossref.org/works?filter=from-created-date:{first}&rows=1000&cursor={next_cursor}"
    base_url_doi = "http://api.crossref.org/works?filter=doi:{doi}"

    # but if you want all changes, use "indexed" rather than "created", as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # base_url_with_last = "http://api.crossref.org/works?filter=from-indexed-date:{first},until-indexed-date:{last}&rows=1000&cursor={next_cursor}"
    # base_url_no_last = "http://api.crossref.org/works?filter=from-indexed-date:{first}&rows=1000&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_so_far = 0

    if today:
        last = datetime.date.today().isoformat()
        first = (datetime.date.today() -
                 datetime.timedelta(days=2)).isoformat()

    if not first:
        first = "2016-04-01"

    while has_more_responses:
        if query_doi:
            url = base_url_doi.format(doi=query_doi)
        else:
            if last:
                url = base_url_with_last.format(first=first,
                                                last=last,
                                                next_cursor=next_cursor)
            else:
                # the query is much faster if no "last" date is specified, even if it is far in the future
                url = base_url_no_last.format(first=first,
                                              next_cursor=next_cursor)

        logger.info(u"calling url: {}".format(url))
        start_time = time()
        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(
            elapsed(start_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(
                resp.status_code))
            return

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)

        if not resp_data["items"] or not next_cursor:
            has_more_responses = False

        for api_raw in resp_data["items"]:
            # logger.info(u":")
            api = {}
            doi = api_raw["DOI"].lower()

            # using _source key for now because that's how it came out of ES and
            # haven't switched everything over yet
            api["_source"] = build_crossref_record(api_raw)
            api["_source"]["doi"] = doi

            my_pub = Pub(id=doi, api=api, api_raw=api_raw)

            my_pub.title = api["_source"]["title"]
            my_pub.normalized_title = normalize_title(my_pub.title)
            db.session.merge(my_pub)
            logger.info(u"got record {}".format(my_pub))
            records_to_save.append(my_pub)

            if len(records_to_save) >= 100:
                safe_commit(db)
                num_so_far += len(records_to_save)
                records_to_save = []
                logger.info(
                    u"committing.  have committed {} so far, in {} seconds, is {} per hour"
                    .format(num_so_far, elapsed(start_time, 1),
                            num_so_far / (elapsed(start_time, 1) / (60 * 60))))

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    safe_commit(db)
    logger.info(u"done everything")