Example #1
def get_crawler_gamename(games_to_insert, logger):
    crawler_infos = []
    gamename_mapping = get_mapping_gamename()
    logger.debug("has %d pkgs in mapping" % len(gamename_mapping))
    for key in games_to_insert:
        key_info = key.split('<>')
        area = key_info[0]
        company = key_info[1]
        pkg_name = key_info[2]
        if pkg_name not in gamename_mapping:
            api = login(logger)
            info = get_info(api, pkg_name, logger)
            info = api.toDict(info)
            info = str(info)
            info = info.replace("true", "True")
            info = info.replace("false", "False")
            info = eval(str(info))
            doc = info["docV2"]
            game_name = doc["title"]
            game_name = ftoj(game_name)
            gamename_mapping[pkg_name] = game_name
            info = {
                    "game_name": game_name,
                    "pkg_name": pkg_name
                }
            crawler_infos.append(info)
    logger.debug("should insert %s pkg in mapping" % len(crawler_infos))
    return crawler_infos
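Note: the replace-and-eval round trip above can also be done with ast.literal_eval, which accepts only Python literals and avoids executing arbitrary expressions. A minimal sketch, assuming api.toDict() stringifies to a dict literal with JSON-style booleans:

import ast

def parse_stringified(info_str):
    # JSON-style booleans -> Python literals, then a safe literal-only parse
    info_str = info_str.replace("true", "True").replace("false", "False")
    return ast.literal_eval(info_str)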
Example #2
def get_terms():
    scrapped_terms = (term for func in term_scrapers for term in func()
                      if len(term) > 1)
    #known_terms = (term for lexicon_type in known_filenames for term in get_terms_from_file(lexicon_type) if len(term) > 1)
    known_terms = []
    terms = itertools.chain.from_iterable([scrapped_terms, known_terms])
    return (jianfan.ftoj(term) for term in terms)
Example #3
def get_matches_query(query_obj):
    standardize = lambda text: jianfan.ftoj(text.lower())

    def matches_query(post):
        inputs = [
            standardize(arg) in standardize(post['text'])
            for arg in query_obj.args
        ]
        return query_obj.match(*inputs)

    return matches_query
Example #4
def match_names(official, other):
    if official == other:
        return 'exact'
    elif official in other or other in official:
        return 'substring'
    tr = jianfan.ftoj(other)  # Avoid repeated calls
    if official == tr:
        return 'translated'
    elif official in tr or tr in official:
        return 'translated substring'
    else:
        return False
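A quick usage sketch (the characters below are standard jianfan conversions); a traditional variant falls through the exact and substring checks and reaches the translated tier:

>>> match_names(u'北京', u'北京市')
'substring'
>>> match_names(u'汉语', u'漢語')
'translated'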
Example #5
def crawler(pkg_name):
    api = login()
    info = get_info(api, pkg_name)
    info = api.toDict(info) 
    info = str(info)
    info = info.replace("true", "True")
    info = info.replace("false", "False")
    info = eval(str(info))
    doc = info["docV2"]
    game_name = doc["title"]
    game_name = ftoj(game_name)
    return game_name
Example #6
def _match_names(official, other):
    """Return a string describing the match between *official* and *other*."""
    if official == other:
        return 'exact'
    elif official in other or other in official:
        return 'substring'
    tr = jianfan.ftoj(other)  # Avoid repeated calls
    if official == tr:
        return 'translated'
    elif official in tr or tr in official:
        return 'translated substring'
    else:
        return False
Example #7
def search(query):
    query = normalize(query)

    if is_cjk(query):
        search_field = 'simplified'
        search_field_display = 'Hanzi'

        # we can support both traditional and simplified queries, by converting traditional to simplified.
        # (traditional->simplified is many to one, which means it's much harder to go the other way.)
        # luckily someone made a pip-installable library, jianfan!
        query = jianfan.ftoj(query)
    elif is_pinyin(query):
        search_field = 'pinyin'
        search_field_display = 'Pinyin'
    else:
        search_field = 'english_full'
        search_field_display = 'English'

    results_list = []
    if search_field == 'english_full':
        whoosh_q = query_parser.parse(query)
        whoosh_results_list = searcher.search(whoosh_q)
        # whoosh returns dictionary-like Hit objects. convert to explicit dictionaries.
        whoosh_results_list = [dict(result) for result in whoosh_results_list]
        results_list.extend(whoosh_results_list)

    if not results_list: # only resort to mongo scanning if no results found yet.
        regex_str = query.replace(' ', '.*')
        # note: typically you want to escape user-generated strings before turning them into regexes.
        # in this case, i don't care.
        regex = re.compile(regex_str, re.UNICODE | re.IGNORECASE)

        results_list.extend(list(
            THE_DICTIONARY.find({search_field: regex}, {'_id': 0}).limit(MAX_RESULTS)
            ))

    # comment out this line to see the effect of search result ranking.
    results_list.sort(cmp=make_cmp(query, search_field))

    results = dict(
        results = results_list,
        count = len(results_list),
        search_type = search_field_display,
        )

    return simplejson.dumps(results, indent=4)
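The many-to-one point in the comment above is easy to see: distinct traditional characters collapse onto one simplified character, so converting back with jtof has to guess. A minimal sketch:

# -*- coding: utf-8 -*-
from jianfan import ftoj, jtof

# both 發 (emit) and 髮 (hair) simplify to 发 ...
print ftoj(u'頭髮').encode('utf-8')  # 头发
print ftoj(u'出發').encode('utf-8')  # 出发
# ... so the reverse mapping has to pick one traditional form
# and may not restore the original text
print jtof(u'头发').encode('utf-8')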
Example #8
def search(query, age_check=None):
    query = jianfan.ftoj(query)
    logging.debug('Requesting 163 for query <{0}>'.format(repr(query)))

    params = {'q': query.encode('utf8'), 'per_page': 20}
    for page in range(25):
        params['page'] = page + 1
        response = get_client().statuses_search(params)
        response = json.loads(response)
        if not response: break
        if type(response) is dict and response.get('error'):
            logging.warning(u'Netease has hit its access limit')
            break
        for result in response:
            if age_check and age_check(result.get('created_at')):
                raise StopIteration
            yield get_tweet(result)
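One portability note on this generator: raise StopIteration inside a generator body is a Python 2 idiom; under PEP 479 (Python 3.7+) it surfaces as RuntimeError. The version-safe spelling of the final loop is:

        for result in response:
            if age_check and age_check(result.get('created_at')):
                return  # ends the generator; replaces raise StopIteration
            yield get_tweet(result)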
Example #9
def search(query, age_check=None):
    query = jianfan.ftoj(query)
    logging.debug('Requesting Tencent for query <{0}>'.format(repr(query)))
    for page in range(17):
        try:
            results = get_client().search.tweet(query, 30, page + 1)
        except qqweibo.error.QWeiboError as error:
            logging.warning(u'Received Tencent error <{0}>'.format(
                error.message))
            if u'rate' in error.message: break
            continue
        usernames = [tweet.name for tweet in results]
        users = dict(
            (user.name, user) for user in get_client()._user_infos(usernames))
        for tweet in results:
            if age_check and age_check(tweet.timestamp): raise StopIteration
            if query not in tweet.text: continue
            yield get_tweet(tweet, users)
Example #10
def search(query, age_check=None):
    query = jianfan.ftoj(query)
    logging.debug('Requesting Sina for query <{0}>'.format(repr(query)))
    for page in range(25):
        uri = u'http://s.weibo.com/weibo/{0}?page={1}'.format(query, page + 1)
        response = requests.get(uri)
        next_page = False
        user_posts = collections.defaultdict(list)
        for post_info in scrape_posts(response.text):
            if age_check and age_check(post_info.get('datetime')): continue
            next_page = True  # if at least one post one page passes age_check, try next page
            user_posts[post_info.pop('uid')].append(post_info)
        user_infos = get_user_infos(user_posts.keys())
        for uid, post_infos in user_posts.items():
            for post_info in post_infos:
                post_info.update(user_infos.get(uid) or {})
                yield post_info
        if not next_page: break
Example #11
def writetoaiml(input, outputset):
    global out, count
    out.write("  <category>\n")
    out.write("    <pattern>")
    words=ftoj(input.decode('utf-8')).encode('utf-8')
    words=words.replace("&","&amp;")
    words=words.replace("<","&lt;")
    words=words.replace(">","&gt;")
    words=words.replace("'","&apos;")
    words=words.replace('"',"&quot;")
    out.write(words)
    out.write("</pattern>\n")
    out.write("    <template>\n")
    if len(outputset) > 1:
        out.write("      <random>\n")
        for x in outputset:
            print input
            print ftoj(x.decode('utf-8')).encode('utf-8')
            out.write("        <li>")
            words=x.replace("&","&amp;")
            words=words.replace("<","&lt;")
            words=words.replace(">","&gt;")
            words=words.replace("'","&apos;")
            words=words.replace('"',"&quot;")
            words=ftoj(words.decode('utf-8')).encode('utf-8')
            out.write(words)
            out.write("</li>\n")
            count += 1
        out.write("      </random>\n")
    else:
        print input
        x=outputset.pop()
        print ftoj(x.decode('utf-8')).encode('utf-8')
        words=x.replace("&","&amp;")
        words=words.replace("<","&lt;")
        words=words.replace(">","&gt;")
        words=words.replace("'","&apos;")
        words=words.replace('"',"&quot;")
        words=ftoj(words.decode('utf-8')).encode('utf-8')
        out.write(words+'\n')

        count += 1
    out.write("    </template>\n")
    out.write("  </category>\n")
    out.flush()
    #count += 1
    print count
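The five manual replace calls (with & deliberately first, so the other entities are not double-escaped) can be collapsed into one standard-library call; a sketch:

from xml.sax.saxutils import escape

def xml_escape(words):
    # escape() handles &, < and > itself (ampersand first), then applies
    # the extra entities passed in
    return escape(words, {"'": "&apos;", '"': "&quot;"})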
Example #12
def crawler(game_infos):
    update_data = []
    for pkg_name in game_infos:
        try:
            api = login()
            info = get_info(api, pkg_name)
            info = api.toDict(info)
            info = str(info)
            info = info.replace("true", "True")
            info = info.replace("false", "False")
            info = eval(str(info))
            doc = info["docV2"]
            game_name = doc["title"]
            game_name = ftoj(game_name)
            info = {
                "pkg_name": pkg_name,
                "game_name": game_name
            }
            update_data.append(info)
        except Exception as e:
            logger.debug("%s crawler error %s" % (pkg_name, e))
    return update_data
Example #13
def segmentToListPerQuery(queryString):
    listPerQuery = []
    segedList = []
    out1 = re.sub("[a-zA-Z]+", "", queryString)
    out1 = re.sub("[%s]" % re.escape(string.punctuation), "", out1)
    # segString = pseg.cut(queryString.decode("utf-8"))
    dd = jianfan.ftoj(out1).encode("utf-8")
    segString = pseg.cut(dd)
    # segString = pseg.cut(queryString.decode("utf-8"))
    # segString = jieba.cut(queryString,cut_all=False)
    # print ".. ".join(segString)
    # for i in segString:
    # 	listPerQuery.append(i)

    for z in segString:
        # print z.word + "\n"
        # if z.flag == "n" or z.flag == "ns" or z.flag == "v" or z.flag == "t" or z.flag == "a" or z.flag == "nr" or z.flag == "nz" or z.flag == "i" or z.flag == "m":
        if z.flag != "x":
            # segedList.append(z.word.encode("utf-8"))
            dd = jianfan.jtof(z.word).encode("utf-8")
            # segedList.append(dd)
            segedList.append(z.word)
    return segedList
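A usage sketch (assumes jieba's pseg and jianfan are installed; Latin text and ASCII punctuation are stripped, traditional characters simplified, then tokens with POS flag "x" dropped):

# -*- coding: utf-8 -*-
if __name__ == "__main__":
    for token in segmentToListPerQuery(u"NBA, 我愛籃球"):
        print token.encode("utf-8")
    # expected tokens: 我 / 爱 / 篮球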
Example #14
def segmentToListPerQuery(queryString):
    listPerQuery = []
    segedList = []
    out1 = re.sub('[a-zA-Z]+', '', queryString)
    out1 = re.sub('[%s]' % re.escape(string.punctuation), '', out1)
    #segString = pseg.cut(queryString.decode("utf-8"))
    dd = jianfan.ftoj(out1).encode("utf-8")
    segString = pseg.cut(dd)
    #segString = pseg.cut(queryString.decode("utf-8"))
    #segString = jieba.cut(queryString,cut_all=False)
    #print ".. ".join(segString)
    #for i in segString:
    #	listPerQuery.append(i)

    for z in segString:
        #print z.word + "\n"
        #if z.flag == "n" or z.flag == "ns" or z.flag == "v" or z.flag == "t" or z.flag == "a" or z.flag == "nr" or z.flag == "nz" or z.flag == "i" or z.flag == "m":
        if z.flag != "x":
            #segedList.append(z.word.encode("utf-8"))
            dd = jianfan.jtof(z.word).encode("utf-8")
            #segedList.append(dd)
            segedList.append(z.word)
    return segedList
Example #15
def perpare_data(info, download_url, file_size, ver_code, ver_name, channel,
                 apk_info):
    if not info:
        return
    doc = info["docV2"]
    #if channel == "GG官方":
    market_channel = channel
    if channel == "samsung":
        market_channel = "三星"
    game_name = doc["title"]
    pkg_name = apk_info["pkg_name"]
    game_desc = doc["descriptionHtml"]
    if game_desc:
        game_desc = game_desc.replace("<br>", "\n").replace("<p>", "\n").replace("</p>", "\n")
    app_details = doc['details']['appDetails']
    game_types = app_details["appCategory"][0]
    try:
        game_types = google_game_type[game_types]
    except KeyError:
        game_types = '其他'
    downloaded_cnts = doc["details"]["appDetails"]["numDownloads"]
    developer = doc['details']['appDetails']['developerName']
    utils.check_developer(developer)
    game_language = "多国语言"

    screen_shot_urls = ""
    icon_url = ""
    images = doc["image"]
    if images:
        for image in images:
            image_type = image["imageType"]
            image_url = image["imageUrl"]
            if image_type == 4:
                icon_url = image_url
            if image_type == 1:
                screen_shot_urls += image_url + "\n"

    is_crack_apk = 1  # cracked version
    min_sdk = ""
    star_num = doc["aggregateRating"]["starRating"]
    now = str(int(time.time()))

    label_info = dict()
    pkg_info = dict()

    game_name = ftoj(game_name)
    game_desc = ftoj(game_desc)

    #if channel == "GG官方":
    g_name = game_name + u"(%s)" % channel
    if channel == "samsung":
        g_name = game_name + u"(samsung)"
    #g_name = game_name + u"(GG官方)"
    #adrooy
    #g_name = game_name + u"(samsung)"
    game_id = utils.gen_label_info_id(g_name)
    g_name = g_name.replace(u"(GG官方)", "")
    if ver_name:
        ver_name = filter(lambda ch: ch in '0123456789.', ver_name)
    if 'gameid' in apk_info:
        game_id = apk_info['gameid']
    label_info["game_id"] = game_id
    label_info["game_name"] = g_name
    label_info["game_types"] = game_types
    label_info["origin_types"] = game_types
    label_info["screen_shot_urls"] = screen_shot_urls
    label_info["icon_url"] = icon_url
    label_info["detail_desc"] = game_desc
    label_info["star_num"] = utils.format_star_num(str(star_num), 2)
    label_info["download_counts"] = utils.format_install_num(downloaded_cnts)
    label_info["game_language"] = game_language
    label_info["now"] = now
    label_info["file_size"] = file_size
    label_info["ver_name"] = ver_name
    label_info["developer"] = developer

    pkg_info["market_channel"] = market_channel
    pkg_info["game_name"] = g_name
    pkg_info["pkg_name"] = pkg_name
    pkg_info["ver_code"] = ver_code
    pkg_info["ver_name"] = ver_name
    pkg_info["file_size"] = file_size
    pkg_info["download_urls"] = download_url.strip()
    pkg_info["game_desc"] = game_desc
    pkg_info["game_types"] = game_types
    pkg_info["origin_types"] = game_types
    pkg_info["downloaded_cnts"] = utils.format_install_num(downloaded_cnts)
    pkg_info["game_language"] = game_language
    pkg_info["screen_shot_urls"] = screen_shot_urls
    pkg_info["icon_url"] = icon_url
    pkg_info["now"] = now
    pkg_info["is_crack_apk"] = is_crack_apk
    if "ggvercode" not in apk_info:
        apk_info["ggvercode"] = "null"
    apk_id = utils.gen_pkg_info_id(0, pkg_name, ver_name, market_channel,
                                   apk_info["ggvercode"])
    pkg_info["apk_id"] = apk_id
    pkg_info["game_id"] = game_id
    pkg_info[
        "url4details"] = "https://play.google.com/store/apps/details?id=%s" % pkg_name
    #print apk_id, game_id
    #import sys
    #sys.exit()
    return label_info, pkg_info
Example #16
    def org_headquarters(self, slot_type):
        def safe_get_sublist(l, target, head_space, tail_space):
            result = []
            target_index = [i for i, x in enumerate(l) if x[0] == target]

            for t in target_index:
                beg = t - head_space
                end = t + tail_space + 1

                if beg >= 0 and end <= len(l):
                    result = l[beg:end]
                elif beg >= 0:
                    result = l[beg:]
                elif beg <= len(l):
                    result = l[:end]
                else:
                    result = [l[t]]
            return result

        gpe_list = []
        if 'country' in slot_type:
            # load country list
            f = io.open('data/dict/country_list', 'r', -1, 'utf-8')
            for line in f:
                gpe_list.append(line.strip())
        elif 'state' in slot_type:
            # load province list
            f = io.open('data/dict/china_province_dict', 'r', -1, 'utf-8')
            for line in f:
                gpe_list.append(line.strip())
        elif 'city' in slot_type:
            # load city list
            china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
            for p in china_province_city:
                if p['type'] == 0 and p['name'] not in (u'台湾', u'臺灣'):  # type 0 means municipality (直辖市)
                    continue
                for c in p['sub']:
                    gpe_list.append(c['name'])
                    if p['name'] in (u'台湾', u'臺灣'):
                        continue
                    for d in c['sub']:
                        gpe_list.append(d['name'])

        line_outputs = []

        for e in self.evidences[slot_type]:
            query_context = safe_get_sublist(e.parse_result['words'], self.query.name, 1, 0)  # get context word around query

            for w in query_context:
                v = jianfan.ftoj(w[0])
                for element in gpe_list:
                    for r in [u'区', u'县', u'市']:
                        v = v.replace(r, '')

                    if element in w[0] and len(element) > 1:
                        slot_filler = element
                        l = self.create_line_output(e, slot_filler, 0, slot_type, combined_slot_filler=True)

                        # ================ post filtering ================= #
                        if u'友好' in l.slot_filler:
                            continue

                        line_outputs.append(l)

        return line_outputs
Example #17
#!/usr/bin/env python
from jianfan import ftoj
import sys

for line in sys.stdin.readlines():
    print ftoj(line.decode("utf-8")).encode("utf-8").rpartition("\n")[0]
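Used as a stdin-to-stdout filter, e.g. python ftoj_filter.py < traditional.txt > simplified.txt (the script name is hypothetical; input is assumed to be UTF-8).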
Example #18
#-*- coding:utf-8
import sys
from jianfan import jtof, ftoj

f = open(sys.argv[1])
s = ftoj(("".join(f.readlines())).decode('utf-8'))
s2 = s.split(" ")
f.close()
f2 = open(sys.argv[2], "w")
f2.write(("%s\t%s" % (s2[0], int(sys.argv[4]))).encode('utf-8'))
f2.close()
f3 = open(sys.argv[3], "w")
f3.write(("%s|||%s|||%s" % (s2[1], s2[2], s2[3])).encode('utf-8'))
f3.close()
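For reference: argv[1] is a UTF-8 input file whose (simplified) content is split on spaces; argv[2] receives the first token, a tab, and the integer argv[4]; argv[3] receives tokens two through four joined by '|||'. A hypothetical invocation: python split_record.py record.txt name_count.txt fields.txt 5.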
Example #19
    def stateorprovince(self, slot_type, evidence_slot_type):
        current_output = self.query_answer.output[slot_type]

        city = None

        # find query's city answer.
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                city = line_output
        if city is None:
            return current_output

        # infer province by city
        province = ''
        evidence = ''  # evidence is a LineOutput object
        city_slot_filler = city.slot_filler
        city_slot_filler = jianfan.ftoj(city_slot_filler)
        for r in [u'区', u'县', u'市']:
            city_slot_filler = city_slot_filler.replace(r, '')

        for p in self.china_province_city:
            if province:
                break
            if p['type'] == 0:
                if city_slot_filler in [item['name'] for item in p['sub']]:
                    province = p['name']
                    evidence = city
                    break
            else:
                for c in p['sub']:
                    if city_slot_filler in [item['name'] for item in c['sub']]:
                        province = p['name']
                        evidence = city
                        break

        # if inference fails, return original answer
        if not province:
            return current_output

        # search provenance
        found_doc_path = search(province + city_slot_filler, self.searcher, self.analyzer, 50)

        if not found_doc_path:
            return current_output

        evidence_doc_path = found_doc_path[0]
        # add additional doc to source_doc for visualization
        doc_id = evidence_doc_path.split('/')[-1].strip()
        doc = io.open(evidence_doc_path, 'r', -1, 'utf-8').read()
        self.sf_object.query_docs[doc_id] = doc

        wp_beg = doc.find(province + city_slot_filler)
        wp_end = wp_beg + len(province + city_slot_filler) - 1
        sp_beg = wp_beg + doc[wp_beg:wp_end+1].find(province)
        sp_end = sp_beg + len(province) - 1

        l = LineOutput()
        l.slot_type = slot_type
        l.run_id = self.query_answer.run_id

        p = Provenance()
        p.doc_id = doc_id
        p.beg = wp_beg
        p.end = wp_end
        p.text = province+city_slot_filler
        l.wide_provenance = [p]
        evidence.wide_provenance[0].inference = True
        l.wide_provenance += evidence.wide_provenance  # evidence is a LineOutput object

        l.slot_filler = province

        p = Provenance()
        p.doc_id = doc_id
        p.beg = sp_beg
        p.end = sp_end
        p.text = province
        l.slot_filler_prov = [p]

        l.confidence_score = 1

        return current_output+[l]
Example #20
    def org_headquarters(self, slot_type):
        def safe_get_sublist(l, target, head_space, tail_space):
            result = []
            target_index = [i for i, x in enumerate(l) if x[0] == target]

            for t in target_index:
                beg = t - head_space
                end = t + tail_space + 1

                if beg >= 0 and end <= len(l):
                    result = l[beg:end]
                elif beg >= 0:
                    result = l[beg:]
                elif beg <= len(l):
                    result = l[:end]
                else:
                    result = [l[t]]
            return result

        gpe_list = []
        if 'country' in slot_type:
            # load country list
            f = io.open('data/dict/country_list', 'r', -1, 'utf-8')
            for line in f:
                gpe_list.append(line.strip())
        elif 'state' in slot_type:
            # load province list
            f = io.open('data/dict/china_province_dict', 'r', -1, 'utf-8')
            for line in f:
                gpe_list.append(line.strip())
        elif 'city' in slot_type:
            # load city list
            china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
            for p in china_province_city:
                if p['type'] == 0 and p['name'] not in (u'台湾', u'臺灣'):  # type 0 means municipality (直辖市)
                    continue
                for c in p['sub']:
                    gpe_list.append(c['name'])
                    if p['name'] in (u'台湾', u'臺灣'):
                        continue
                    for d in c['sub']:
                        gpe_list.append(d['name'])

        line_outputs = []

        for e in self.evidences[slot_type]:
            query_context = safe_get_sublist(e.parse_result['words'], self.query.name, 1, 0)  # get context word around query

            for w in query_context:
                v = jianfan.ftoj(w[0])
                for element in gpe_list:
                    for r in [u'区', u'县', u'市']:
                        v = v.replace(r, '')

                    if element in w[0] and len(element) > 1:
                        slot_filler = element
                        l = self.create_line_output(e, slot_filler, 0, slot_type, combined_slot_filler=True)

                        # ================ post filtering ================= #
                        if u'友好' in l.slot_filler:
                            continue

                        line_outputs.append(l)

        return line_outputs
Example #21
def perpare_data(info, apk_info):
    if not info:
        return
    doc = info["docV2"]
    game_name = doc["title"]
    game_desc = doc["descriptionHtml"]
    if game_desc:
        game_desc = game_desc.replace("<br>", "\n").replace("<p>", "\n").replace("</p>", "\n")
    app_details = doc['details']['appDetails']
    game_types = app_details["appCategory"][0]
    try:
        game_types = google_game_type[game_types]
    except KeyError:
        game_types = '其他'
    downloaded_cnts = doc["details"]["appDetails"]["numDownloads"]
    developer = doc['details']['appDetails']['developerName']
    game_language = "多国语言"
    screen_shot_urls = ""
    icon_url = ""
    images = doc["image"]
    if images:
        for image in images:
            image_type = image["imageType"]
            image_url = image["imageUrl"]
            if image_type == 4:
                icon_url = image_url
            if image_type == 1:
                screen_shot_urls += image_url + "\n"
    star_num = doc["aggregateRating"]["starRating"]
    # convert Traditional Chinese to Simplified
    game_name = ftoj(game_name)
    game_desc = ftoj(game_desc)
    developer = ftoj(developer)

    pkg_info = {}
    label_info = {}

    pkg_info['apk_id'] = apk_info['apk_id']
    pkg_info['game_id'] = apk_info['gameid']
    pkg_info['market_channel'] = apk_info['channel']
    pkg_info['game_name'] = game_name
    pkg_info['pkg_name'] = apk_info['pkg_name']
    pkg_info['ver_code'] = apk_info['ver_code']
    pkg_info['ver_name'] = apk_info['ver_name']
    pkg_info['file_size'] = apk_info['file_size']
    pkg_info['download_url'] = apk_info['download_url']
    pkg_info['game_desc'] = game_desc
    pkg_info['downloaded_cnts'] = downloaded_cnts
    pkg_info['game_language'] = game_language
    pkg_info['screen_shot_urls'] = screen_shot_urls
    pkg_info['icon_url'] = icon_url
    pkg_info['min_sdk'] = apk_info['min_sdk']
    pkg_info['download_url_type'] = apk_info['download_url_type']
    pkg_info['source'] = apk_info['source']
    pkg_info['signature_md5'] = apk_info['signature_md5']
    pkg_info['file_md5'] = apk_info['file_md5']
    pkg_info['origin_types'] = game_types
    pkg_info['gpu_vender'] = apk_info['gpu_vender']
    pkg_info['ver_code_by_gg'] = apk_info['ggvercode']
    pkg_info['update_desc'] = apk_info['update_desc']
    pkg_info['file_type'] = apk_info['file_type']
    pkg_info['save_user'] = apk_info['save_user']
    pkg_info['now'] = int(time.time())

    label_info['game_id'] = apk_info['gameid']
    label_info['game_name'] = game_name
    label_info['screen_shot_urls'] = screen_shot_urls
    label_info['icon_url'] = icon_url
    label_info['detail_desc'] = game_desc
    label_info['game_language'] = game_language
    label_info['file_size'] = apk_info['file_size']
    label_info['ver_name'] = apk_info['ver_name']
    label_info['source'] = apk_info['source']
    label_info['origin_types'] = game_types
    label_info['developer'] = developer
    label_info['save_user'] = apk_info['save_user']
    label_info['enabled'] = 0
    label_info['now'] = int(time.time())
    label_info['downloaded_cnts'] = downloaded_cnts
    label_info['star_num'] = star_num
    return label_info, pkg_info
Example #22
def perpare_data(info, download_url, file_size, ver_code, ver_name, channel, apk_info):
    if not info:
        return
    doc = info["docV2"]
    #if channel == "GG官方":
    market_channel = channel
    if channel == "samsung":
         market_channel = "三星"
    game_name = doc["title"]
    pkg_name = apk_info["pkg_name"]
    game_desc = doc["descriptionHtml"]
    if game_desc:
        game_desc = game_desc.replace("<br>", "\n").replace("<p>", "\n").replace("</p>", "\n")
    app_details = doc['details']['appDetails']
    game_types = app_details["appCategory"][0]
    try:
        game_types = google_game_type[game_types]
    except KeyError:
        game_types = '其他'
    downloaded_cnts = doc["details"]["appDetails"]["numDownloads"]
    developer = doc['details']['appDetails']['developerName']
    utils.check_developer(developer)
    game_language = "多国语言"

    screen_shot_urls = ""
    icon_url = ""
    images = doc["image"]
    if images:
        for image in images:
            image_type = image["imageType"]
            image_url = image["imageUrl"]
            if image_type == 4:
                icon_url = image_url
            if image_type == 1:
                screen_shot_urls += image_url + "\n"

    is_crack_apk = 1  # cracked version
    min_sdk = ""
    star_num = doc["aggregateRating"]["starRating"]
    now = str(int(time.time()))

    label_info = dict()
    pkg_info = dict()

    game_name = ftoj(game_name)
    game_desc = ftoj(game_desc)
    
    #if channel == "GG官方":
    g_name = game_name + u"(%s)" % channel
    if channel == "samsung":
          g_name = game_name + u"(samsung)"
    #g_name = game_name + u"(GG官方)"
    #adrooy
    #g_name = game_name + u"(samsung)"
    game_id = utils.gen_label_info_id(g_name)
    g_name = g_name.replace(u"(GG官方)", "")
    if ver_name:
        ver_name = filter(lambda ch: ch in '0123456789.', ver_name)
    if 'gameid' in apk_info:
        game_id = apk_info['gameid']
    label_info["game_id"] = game_id
    label_info["game_name"] = g_name
    label_info["game_types"] = game_types
    label_info["origin_types"] = game_types
    label_info["screen_shot_urls"] = screen_shot_urls
    label_info["icon_url"] = icon_url
    label_info["detail_desc"] = game_desc
    label_info["star_num"] = utils.format_star_num(str(star_num), 2)
    label_info["download_counts"] = utils.format_install_num(downloaded_cnts)
    label_info["game_language"] = game_language
    label_info["now"] = now
    label_info["file_size"] = file_size
    label_info["ver_name"] = ver_name
    label_info["developer"] = developer

    pkg_info["market_channel"] = market_channel
    pkg_info["game_name"] = g_name
    pkg_info["pkg_name"] = pkg_name
    pkg_info["ver_code"] = ver_code
    pkg_info["ver_name"] = ver_name
    pkg_info["file_size"] = file_size
    pkg_info["download_urls"] = download_url.strip()
    pkg_info["game_desc"] = game_desc
    pkg_info["game_types"] = game_types
    pkg_info["origin_types"] = game_types
    pkg_info["downloaded_cnts"] = utils.format_install_num(downloaded_cnts)
    pkg_info["game_language"] = game_language
    pkg_info["screen_shot_urls"] = screen_shot_urls
    pkg_info["icon_url"] = icon_url
    pkg_info["now"] = now
    pkg_info["is_crack_apk"] = is_crack_apk
    if "ggvercode" not in apk_info:
        apk_info["ggvercode"] = "null"
    apk_id = utils.gen_pkg_info_id(0, pkg_name, ver_name, market_channel, apk_info["ggvercode"])
    pkg_info["apk_id"] = apk_id
    pkg_info["game_id"] = game_id
    pkg_info["url4details"] = "https://play.google.com/store/apps/details?id=%s" % pkg_name
    #print apk_id, game_id
    #import sys
    #sys.exit()
    return label_info, pkg_info
Example #23
def update(f, verbose=False):
    global data
#    # Read the NBS website
#    d0 = parse_raw(urllib.request.urlopen(URL))
    # the download above is commented out; parse a cached copy of the website instead
    codes = parse_raw(f)

    # Save the latest table
    with open(data_fn('latest'), 'w') as f1:
        w = csv.writer(f1, lineterminator=linesep)
        w.writerow(['code', 'name_zh', 'level'])
        for code in sorted(codes.keys()):
            w.writerow([code, codes[code]['name_zh'], codes[code]['level']])

    # Load the CITAS table
    d1 = load_file(open(data_fn('citas'), 'r'), 'C-gbcode', lambda row:
                   row['todate'] == '19941231')

    # Load the GB/T 2260-2007 tables, from two files
    d2 = load_file(open(data_fn('gbt_2260-2007'), 'r'), 'code')
    d3 = load_file(open(data_fn('gbt_2260-2007_sup'), 'r'), 'code')
    for code, d in d3.items():  # Merge the two GB/T 2260-2007 files
        if code in d2:  # Code appears in both files
            # Don't overwrite name_zh in gbt_2260-2007.csv with an empty
            # name_zh from gbt_2260-2007_sup.csv
            dict_update(d2[code], d, conflict=lambda a, b, k: not(k ==
                        'name_zh' or b is None))
        else:  # Code only appears in gbt_2260-2007_sup.csv
            d2[code] = d

    # Load extra data pertaining to the latest table
    d4 = load_file(open(data_fn('extra'), 'r'), 'code')

    # Merge using codes
    for code in sorted(codes.keys()):
        # Store debug information to be printed (or not) later
        message = '{}\t{}\n'.format(code, codes[code]['name_zh'])
        # Merge CITAS entry for this code
        if code not in d1:
            message += '  does not appear in CITAS data set\n'
        else:
            d = dict(d1[code])  # Make a copy
            name_zh = d.pop('N-hanzi')
            if not match_names(codes[code]['name_zh'], name_zh):
                message += '  CITAS name {} ({}) does not match\n'.format(
                    name_zh, jianfan.ftoj(name_zh))
            else:
                d['name_en'] = d.pop('N-local').replace("`", "'")
                d['name_pinyin'] = d.pop('N-pinyin').replace("`", "'")
                dict_update(codes[code], d)
        # Merge GB/T 2260-2007 entry for this code
        if code not in d2:
            message += '  does not appear in GB/T 2260-2007\n'
        else:
            d = dict(d2[code])
            if (len(d['name_zh']) and not codes[code]['name_zh'] ==
                    d['name_zh']):
                message += '  GB/T 2260-2007 name {} does not match\n'.format(
                    d['name_zh'])
            else:
                # Don't overwrite name_en from CITAS with empty name_en from
                # GB/T 2260-2007
                dict_update(codes[code], d, conflict=lambda a, b, k: not(
                            'name_' in k and b == ''))
        # Merge extra data
        if code in d4:
            dict_update(codes[code], d4[code], conflict='squash')
        if verbose and message.count('\n') > 1:
            print(message, end='')

    # TODO merge on names

    # Write the unified data set to file
    with open(data_fn('unified'), 'w') as f:
        w = csv.DictWriter(f, ('code', 'name_zh', 'name_en', 'name_pinyin',
                               'alpha', 'level', 'latitude', 'longitude'),
                           extrasaction='ignore', lineterminator=linesep)
        w.writeheader()
        for k in sorted(codes.keys()):
            w.writerow(codes[k])
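For context, a minimal sketch of the merge helper used above (hypothetical; the real dict_update lives elsewhere in this project, and its defaults are guessed): it copies keys of d2 into d1, and conflict decides whether d2 wins when both sides have a value ('squash' always takes d2's value, a callable takes it only when conflict(a, b, k) is true):

def dict_update(d1, d2, conflict='squash'):
    for k, b in d2.items():
        a = d1.get(k)
        take_new = (a is None or conflict == 'squash'
                    or (callable(conflict) and conflict(a, b, k)))
        if take_new:
            d1[k] = b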
Example #24
def tradition_to_simple(text):
    return ftoj(text)
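Usage is a straight passthrough to jianfan (standard conversions):

>>> tradition_to_simple(u'歡迎光臨')
u'欢迎光临'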
Example #25
def update(version='2015-09-30', use_cache=False, verbose=False,
           target=None):
    """Update the database.

    :meth:`update` relies on four sources, in the following order of authority:

    1. Error corrections from ``extra.csv``.
    2. The latest list of codes from the NBS website indicated by *version*
       (see :meth:`parse_html`). For instance, *version* ‘2013-08-31’ was
       published on 2014-01-17. If *use_cache* is :py:data:`True`, then a
       cached HTML list is used from the directory ``data/cache/`` (see
       :meth:`refresh_cache`). Otherwise, or if the cache is missing, the file
       is downloaded from the website.
    3. The data set `GuoBiao (GB) Codes for the Administrative Divisions of the
       People's Republic of China, v1 (1982 – 1992)
       <http://sedac.ciesin.columbia.edu/data/set/
       cddc-china-guobiao-codes-admin-divisions>`_ (``citas.csv``), produced by
       the NASA Socioeconomic Data and Applications Center (SEDAC), the
       University of Washington Chinese Academy of Surveying and Mapping
       (CASM), the Columbia University Center for International Earth Science
       Information Network (CIESIN) as part of the China in Time and Space
       (*CITAS*) project. This data set contains Pinyin transcriptions.
    4. The information in ``gbt_2260-2007.csv`` (provided by `@qiaolun
       <https://github.com/qiaolun>`_) and ``gbt_2260-2007_sup.csv``
       (supplement) transcribed from the published GB/T 2260-2007 standard.

    If *verbose* is :py:data:`True`, verbose output is given.

    The following files are updated:

    - ``latest.csv`` with information from source #1 only: codes, Chinese names
      (``name_zh``), and ``level``.
    - ``unified.csv`` with all database fields and information from sources #2,
      #3 and #4.
    - ``unified.db``, the same information in a :py:mod:`sqlite3` database.
    """
    _configure_log(verbose)

    if use_cache:
        try:
            fn = data_fn(os.path.join('cache', version), 'html')
            log.info('reading from cached %s', fn)
            f = open(fn, 'r')
        except FileNotFoundError:
            log.info('  missing.')
            use_cache = False

    if not use_cache:
        from urllib.request import urlopen
        log.info('retrieving codes from %s', URLS[version])
        f = urlopen(URLS[version])

    # Parse the codes from HTML
    log.info('  parsing...')
    codes = parse_html(f, version.split('-')[0])
    assert sorted(codes.keys()) == list(codes.keys())
    log.info('  done.')

    # Save the latest table
    fn = data_fn('latest', path=target)
    with open(fn, 'w') as f1:
        w = csv.writer(f1, lineterminator=linesep)
        w.writerow(['code', 'name_zh', 'level'])
        for code in sorted(codes.keys()):
            w.writerow([code, codes[code]['name_zh'], codes[code]['level']])
    log.info('wrote %s', fn)

    # Load the CITAS table
    d1 = load_csv('citas', 'C-gbcode',
                  filter=lambda row: row['todate'] == '19941231')
    log.info('read CITAS data')

    # Load the GB/T 2260-2007 tables, from two files
    d2 = load_csv('gbt_2260-2007')
    d3 = load_csv('gbt_2260-2007_sup')
    log.info('loaded GB/T 2260-2007 entries')

    # Merge the two GB/T 2260-2007 files
    for code, d in d3.items():
        if code in d2:
            # Code appears in both files. Don't overwrite name_zh in
            # gbt_2260-2007.csv with an empty name_zh from
            # gbt_2260-2007_sup.csv.
            _dict_update(d2[code], d, conflict=lambda a, b, k: not(k ==
                         'name_zh' or b is None))
        else:
            # Code only appears in gbt_2260-2007_sup.csv
            d2[code] = d

    # Load extra data pertaining to the latest table
    d4 = load_csv('extra')
    log.info('loaded extra data')

    # Regular expression for English names from the CITAS database:
    # In a name like 'Beijing: Dongcheng qu' the prefix 'Beijing: ' is a
    # repetition of the name of the parent division, and the suffix ' qu' is
    # the type, not the name, of the area.
    name_re = re.compile('(?:[^:]*: )?(.*?)(?: (%s))?$' % '|'.join(SUFFIXES))

    pinyin = Pinyin()

    # Merge using codes
    log.info('merging codes')
    for code, entry in codes.items():
        # Store debug information to be printed (or not) later
        message = ['%s\t%s' % (code, entry['name_zh'])]

        if code in d1:
            # Merge CITAS entry for this code
            # Make a copy
            d = dict(d1[code])
            name_zh = d.pop('N-hanzi')
            if not _match_names(entry['name_zh'], name_zh):
                message.append('  CITAS name %s (%s) does not match' %
                               (name_zh, jianfan.ftoj(name_zh)))
            else:
                d['name_en'] = d.pop('N-local').replace("`", "'")
                d['name_pinyin'] = d.pop('N-pinyin').replace("`", "'")
                _dict_update(entry, d)
        else:
            message.append('  does not appear in CITAS data set')

        if code in d2:
            # Merge GB/T 2260-2007 entry for this code
            d = dict(d2[code])
            if len(d['name_zh']) and entry['name_zh'] != d['name_zh']:
                message.append('  GB/T 2260-2007 name %s does not match' %
                               d['name_zh'])
            else:
                # Don't overwrite name_en from CITAS with empty name_en from
                # GB/T 2260-2007
                _dict_update(entry, d, conflict=lambda a, b, k: not(
                             'name_' in k and b == ''))
        else:
            message.append('  does not appear in GB/T 2260-2007')

        # Merge extra data
        if code in d4:
            _dict_update(entry, d4[code], conflict='squash')

        # Clean up English names (in most cases, the CITAS romanized name)
        if entry['name_en'] is not None:
            # Replace ' shixiaqu' with ' city area', but do not discard
            name_en = entry['name_en'].replace(' shixiaqu', ' city area')
            # Use regex to discard prefixes and suffixes on names
            entry['name_en'] = name_re.match(name_en).group(1)
        elif entry['name_zh'] == '市辖区':
            # Fill in blank with 'CITYNAME city area', where possible
            pname = codes[_parents(code)[1]]['name_en']
            entry['name_en'] = None if pname is None else pname + ' city area'

        # Fill in pinyin names
        if entry['name_pinyin'] is None:
            entry['name_pinyin'] = pinyin.get_pinyin(entry['name_zh'],
                                                     '').title()

        if len(message) > 1 and 'does not appear in CITAS' not in message[1]:
            log.info('\n'.join(message))
        else:
            log.debug('\n'.join(message))
    log.info('merge complete')

    # Write the unified data set to CSV
    fn = data_fn('unified', path=target)
    with open(fn, 'w') as f:
        w = csv.DictWriter(f, ('code', 'name_zh', 'name_en', 'name_pinyin',
                               'alpha', 'level', 'latitude', 'longitude'),
                           extrasaction='ignore', lineterminator=linesep)
        w.writeheader()
        for k in sorted(codes.keys()):
            w.writerow(codes[k])
    log.info('wrote %s', fn)

    write_sqlite('unified', codes, target=target)
    log.info('wrote sqlite3 database')
Example #26
def perpare_data(info, apk_info):
    if not info:
        return
    doc = info["docV2"]
    game_name = doc["title"]
    game_desc = doc["descriptionHtml"]
    if game_desc:
        game_desc = game_desc.replace("<br>", "\n").replace("<p>", "\n").replace("<p>", "\n")
    app_details = doc['details']['appDetails']
    game_types = app_details["appCategory"][0]
    try:
        game_types = google_game_type[game_types]
    except KeyError:
        game_types = '其他'
    downloaded_cnts = doc["details"]["appDetails"]["numDownloads"]
    developer = doc['details']['appDetails']['developerName']
    game_language = "多国语言"
    screen_shot_urls = ""
    icon_url = ""
    images = doc["image"]
    if images:
        for image in images:
            image_type = image["imageType"]
            image_url = image["imageUrl"]
            if image_type == 4:
                icon_url = image_url
            if image_type == 1:
                screen_shot_urls += image_url + "\n"
    star_num = doc["aggregateRating"]["starRating"]
    # convert Traditional Chinese to Simplified
    game_name = ftoj(game_name)
    game_desc = ftoj(game_desc)
    developer = ftoj(developer)

    pkg_info = {}
    label_info = {}

    pkg_info['apk_id'] = apk_info['apk_id']
    pkg_info['game_id'] = apk_info['gameid']
    pkg_info['market_channel'] = apk_info['channel']
    pkg_info['game_name'] = game_name
    pkg_info['pkg_name'] = apk_info['pkg_name']
    pkg_info['ver_code'] = apk_info['ver_code']
    pkg_info['ver_name'] = apk_info['ver_name']
    pkg_info['file_size'] = apk_info['file_size']
    pkg_info['download_url'] = apk_info['download_url']
    pkg_info['game_desc'] = game_desc
    pkg_info['downloaded_cnts'] = downloaded_cnts
    pkg_info['game_language'] = game_language
    pkg_info['screen_shot_urls'] = screen_shot_urls
    pkg_info['icon_url'] = icon_url
    pkg_info['min_sdk'] = apk_info['min_sdk']
    pkg_info['download_url_type'] = apk_info['download_url_type']
    pkg_info['source'] = apk_info['source']
    pkg_info['signature_md5'] = apk_info['signature_md5']
    pkg_info['file_md5'] = apk_info['file_md5']
    pkg_info['origin_types'] = game_types
    pkg_info['gpu_vender'] = apk_info['gpu_vender']
    pkg_info['ver_code_by_gg'] = apk_info['ggvercode']
    pkg_info['update_desc'] = apk_info['update_desc']
    pkg_info['file_type'] = apk_info['file_type']
    pkg_info['save_user'] = apk_info['save_user']
    pkg_info['now'] = int(time.time())

    label_info['game_id'] = apk_info['gameid']
    label_info['game_name'] = game_name
    label_info['screen_shot_urls'] = screen_shot_urls
    label_info['icon_url'] = icon_url
    label_info['detail_desc'] = game_desc
    label_info['game_language'] = game_language
    label_info['file_size'] = apk_info['file_size']
    label_info['ver_name'] = apk_info['ver_name']
    label_info['source'] = apk_info['source']
    label_info['origin_types'] = game_types
    label_info['developer'] = developer
    label_info['save_user'] = apk_info['save_user']
    label_info['enabled'] = 0
    label_info['now'] = int(time.time())
    label_info['downloaded_cnts'] = downloaded_cnts
    label_info['star_num'] = star_num
    return label_info, pkg_info
Example #27
    def evidence_extaction(self):
        # ************* batch segment long article ************* #
        start = time.time()
        if os.path.exists('data/.tmp/'):
            shutil.rmtree('data/.tmp')
        os.makedirs('data/.tmp/')  # create a temporary dir for parsing large paragraphs
        for doc_id in self.cleaned_docs:
            f = io.open(os.path.join('data/.tmp', doc_id), 'w', -1, 'utf-8')
            f.write(self.cleaned_docs[doc_id])
            f.close()

        # run stanford segmenter
        stanford_nlp_dir = os.path.join(self.CN_SF_PATH,
                                        'externals/stanford-corenlp-full-2014-08-27/')
        segmenter_result = list(batch_parse('data/.tmp/',
                                            stanford_nlp_dir,
                                            properties=os.path.join(stanford_nlp_dir,
                                                                    "StanfordCoreNLP-chinese.Segmenter.properties")
                                            ))
        for r in segmenter_result:
            self.segmented_docs[r['file_name']] = r['sentences']
        print('segmenting time cost '+str(time.time()-start))

        # cpickle for development
        # cPickle.dump(self.segmented_docs, open('data/segmented_docs.pkl', 'wb'))
        # self.segmented_docs = cPickle.load(open('data/segmented_docs.pkl', 'rb'))

        # ************* select evidence ************* #
        sent_to_parse = dict()

        self.evidence = OrderedDict()
        for query in self.queries:
            print('\textracting ' + query.name)

            evidences = OrderedDict()  # {slot_type: sentence_parsed_result}
            for doc_id in self.query_docs[query.id].keys():
                seg_result = self.segmented_docs[doc_id]
                for i in xrange(len(seg_result)):  # sentence is stanford standard format output
                    sentence = seg_result[i]
                    sent_id = '|'.join([doc_id, str(i)])
                    # if sentence is too long or too short, it carries less dependency information
                    if len(sentence['words']) > 130 or len(sentence['words']) < 3:
                        continue

                    sent_text = ''.join(sentence['text'])

                    # *************** check if this sentence is an evidence ******************** #
                    # ============== common case ============= #
                    seg_sent_text = sentence['text']  # list of tokens
                    seg_sent_text = [jianfan.ftoj(w) for w in seg_sent_text]

                    # here joining s['text'] list will overcome segmentation errors
                    if query.name not in ''.join(seg_sent_text):
                        continue

                    triggers = self.triggers[query.entity_type]

                    if query.entity_type == 'PER':
                        slot_types = self.PER_SLOT_TYPE
                    elif query.entity_type == 'ORG':
                        slot_types = self.ORG_SLOT_TYPE

                    for slot_type in slot_types:
                        if slot_type not in evidences.keys():
                            evidences[slot_type] = []
                        for t in triggers[slot_type]:
                            # compare triggers to segmented words; might be affected by segmentation errors
                            if t not in seg_sent_text:
                                continue
                            evidences[slot_type].append(Evidence(doc_id, query.id, t, sent_text, sent_id))
                            sent_to_parse[sent_id] = sent_text  # add sentence and do parallel parsing later.

                    # ============== special case ============== #
                    if query.entity_type == 'PER':
                        evidences['per:alternate_names'].append(Evidence(doc_id, query.id, '',
                                                                         sent_text, sent_id, sentence))

                    if query.entity_type == 'ORG':
                        # for org:alternate_names, any sentence containing the query is evidence for pattern matching
                        evidences['org:alternate_names'].append(Evidence(doc_id, query.id, '',
                                                                         sent_text, sent_id, sentence))

                        # for org:XXX_headquarters, any sentence containing the query is evidence for pattern matching
                        evidences['org:country_of_headquarters'].append((Evidence(doc_id, query.id, '',
                                                                                  sent_text, sent_id, sentence)))
                        evidences['org:stateorprovince_of_headquarters'].append((Evidence(doc_id, query.id, '',
                                                                                          sent_text, sent_id, sentence)))
                        evidences['org:city_of_headquarters'].append((Evidence(doc_id, query.id, '',
                                                                               sent_text, sent_id, sentence)))

            self.evidence[query.id] = evidences

        # *************** parallel parsing ****************** #
        def chunkIt(seq, num):
            avg = len(seq) / float(num)
            out = []
            last = 0.0

            while last < len(seq):
                out.append(seq[int(last):int(last + avg)])
                last += avg

            return out

        # run stanford parser in multiprocessing
        process_num = min(multiprocessing.cpu_count() // 2, 10)
        p = multiprocessing.Pool(processes=process_num)
        chunked_sent = [dict(item) for item in chunkIt(sent_to_parse.items(), process_num)]
        mp_result = [p.apply_async(stanford_parser,
                                   args=(chunked_sent[i], str(i))) for i in range(process_num)]
        mp_result = [res.get() for res in mp_result]
        sent_parsing_result = {}
        for r in mp_result:
            sent_parsing_result.update(r)

        # cpickle for development
        # cPickle.dump(sent_parsing_result, open('data/sent_parsing_result.pkl', 'wb'))
        # sent_parsing_result = cPickle.load(open('data/sent_parsing_result.pkl', 'rb'))

        # updating evidences
        for q_id in self.evidence.keys():
            evidences = self.evidence[q_id]
            for slot_type in evidences.keys():
                for e in evidences[slot_type]:
                    if not e.trigger:
                        continue
                    e.parse_result = sent_parsing_result[e.sent_id]

        # *************** correct segmenter error ******************** #
        china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
        province_city_list = []
        for p in china_province_city:
            province_city_list += [p['name']]
            for c in p['sub']:
                province_city_list += [c['name']]
                if p['type'] == 0:
                    continue
                for d in c['sub']:
                    province_city_list += [d['name']]

        for q_id in self.evidence.keys():
            for slot_type in self.evidence[q_id]:
                for i in xrange(len(self.evidence[q_id][slot_type])):
                    self.evidence[q_id][slot_type][i] = self.correct_evidence(self.find_query(q_id).name,
                                                                              self.evidence[q_id][slot_type][i])
                    for p_or_c in province_city_list:
                        if len(p_or_c) > 2 and p_or_c in \
                                ''.join(self.evidence[q_id][slot_type][i].parse_result['text']):
                            self.evidence[q_id][slot_type][i] = \
                                self.correct_evidence(p_or_c, self.evidence[q_id][slot_type][i])

        print('Done')
Example #28
    def country(self, slot_type, evidence_slot_type):
        current_output = self.query_answer.output[slot_type]

        province = None

        # find query's province and city answer.
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                province = line_output
        if province is None:
            return current_output

        # infer country by province
        country = ''
        evidence = ''  # evidence is a LineOutput object
        state_slot_filler = jianfan.ftoj(province.slot_filler)
        for c in self.world_coutry_province:
            if state_slot_filler in self.world_coutry_province[c]:
                country = c
                evidence = province
                break

        # if inference fails, return original answer
        if not country:
            return current_output

        # search provenance
        found_doc_path = search(country + state_slot_filler,
                                self.sf_object.lucene_searcher,
                                self.sf_object.lucene_analyzer, 50)

        if not found_doc_path:
            return current_output

        evidence_doc_path = found_doc_path[0]
        # add additional doc to source_doc for visualization
        doc_id = evidence_doc_path.split('/')[-1].strip()
        doc = io.open(evidence_doc_path, 'r', -1, 'utf-8').read()
        self.sf_object.query_docs[doc_id] = doc

        wp_beg = doc.find(country + state_slot_filler)
        wp_end = wp_beg + len(country + state_slot_filler) - 1
        sp_beg = wp_beg + doc[wp_beg:wp_end + 1].find(country)
        sp_end = sp_beg + len(country) - 1

        l = LineOutput()
        l.slot_type = slot_type
        l.run_id = self.query_answer.run_id

        p = Provenance()
        p.doc_id = doc_id
        p.beg = wp_beg
        p.end = wp_end
        p.text = country + state_slot_filler
        l.wide_provenance = [p]
        evidence.wide_provenance[0].inference = True
        l.wide_provenance += evidence.wide_provenance  # evidence is a LineOutput object

        l.slot_filler = country

        p = Provenance()
        p.doc_id = doc_id
        p.beg = sp_beg
        p.end = sp_end
        p.text = country
        l.slot_filler_prov = [p]

        l.confidence_score = 1

        # if province is 台湾, country should also add 台湾
        if u'台湾' in jianfan.ftoj(province.slot_filler):
            return current_output + [l, province]

        return current_output + [l]
Example #29
    def country(self, slot_type, evidence_slot_type):
        current_output = self.query_answer.output[slot_type]

        province = None

        # find query's province and city answer.
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                province = line_output
        if province is None:
            return current_output

        # infer country by province
        country = ''
        evidence = ''  # evidence is a LineOutput object
        state_slot_filler = jianfan.ftoj(province.slot_filler)
        for c in self.world_coutry_province:
            if state_slot_filler in self.world_coutry_province[c]:
                country = c
                evidence = province
                break

        # if inference fails, return original answer
        if not country:
            return current_output

        # search provenance
        found_doc_path = search(country + state_slot_filler,
                                self.sf_object.lucene_searcher, self.sf_object.lucene_analyzer, 50)

        if not found_doc_path:
            return current_output

        evidence_doc_path = found_doc_path[0]
        # add additional doc to source_doc for visualization
        doc_id = evidence_doc_path.split('/')[-1].strip()
        doc = io.open(evidence_doc_path, 'r', -1, 'utf-8').read()
        self.sf_object.query_docs[doc_id] = doc

        wp_beg = doc.find(country + state_slot_filler)
        wp_end = wp_beg + len(country + state_slot_filler) - 1
        sp_beg = wp_beg + doc[wp_beg:wp_end+1].find(country)
        sp_end = sp_beg + len(country) - 1

        l = LineOutput()
        l.slot_type = slot_type
        l.run_id = self.query_answer.run_id

        p = Provenance()
        p.doc_id = doc_id
        p.beg = wp_beg
        p.end = wp_end
        p.text = country+state_slot_filler
        l.wide_provenance = [p]
        evidence.wide_provenance[0].inference = True
        l.wide_provenance += evidence.wide_provenance  # evidence is a LineOutput object

        l.slot_filler = country

        p = Provenance()
        p.doc_id = doc_id
        p.beg = sp_beg
        p.end = sp_end
        p.text = country
        l.slot_filler_prov = [p]

        l.confidence_score = 1

        # if province is 台湾, country should also add 台湾
        if u'台湾' in jianfan.ftoj(province.slot_filler):
            return current_output+[l, province]

        return current_output+[l]
Example #30
    def stateorprovince(self, slot_type, evidence_slot_type):
        current_output = self.query_answer.output[slot_type]

        city = None

        # find query's city answer.
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                city = line_output
        if city is None:
            return current_output

        # infer province by city
        province = ''
        evidence = ''  # evidence is a LineOutput object
        city_slot_filler = city.slot_filler
        city_slot_filler = jianfan.ftoj(city_slot_filler)
        for r in [u'区', u'县', u'市']:
            city_slot_filler = city_slot_filler.replace(r, '')

        for p in self.china_province_city:
            if province:
                break
            if p['type'] == 0:
                if city_slot_filler in [item['name'] for item in p['sub']]:
                    province = p['name']
                    evidence = city
                    break
            else:
                for c in p['sub']:
                    if city_slot_filler in [item['name'] for item in c['sub']]:
                        province = p['name']
                        evidence = city
                        break

        # if inference fails, return original answer
        if not province:
            return current_output

        # search provenance
        found_doc_path = search(province + city_slot_filler, self.searcher,
                                self.analyzer, 50)

        if not found_doc_path:
            return current_output

        evidence_doc_path = found_doc_path[0]
        # add additional doc to source_doc for visualization
        doc_id = evidence_doc_path.split('/')[-1].strip()
        doc = io.open(evidence_doc_path, 'r', -1, 'utf-8').read()
        self.sf_object.query_docs[doc_id] = doc

        wp_beg = doc.find(province + city_slot_filler)
        wp_end = wp_beg + len(province + city_slot_filler) - 1
        sp_beg = wp_beg + doc[wp_beg:wp_end + 1].find(province)
        sp_end = sp_beg + len(province) - 1

        l = LineOutput()
        l.slot_type = slot_type
        l.run_id = self.query_answer.run_id

        p = Provenance()
        p.doc_id = doc_id
        p.beg = wp_beg
        p.end = wp_end
        p.text = province + city_slot_filler
        l.wide_provenance = [p]
        evidence.wide_provenance[0].inference = True
        l.wide_provenance += evidence.wide_provenance  # evidence is a LineOutput object

        l.slot_filler = province

        p = Provenance()
        p.doc_id = doc_id
        p.beg = sp_beg
        p.end = sp_end
        p.text = province
        l.slot_filler_prov = [p]

        l.confidence_score = 1

        return current_output + [l]
Example #31
# -*- coding:utf-8
import sys
from jianfan import jtof, ftoj

f = open(sys.argv[1])
s = ftoj(("".join(f.readlines())).decode("utf-8"))
s2 = s.split(" ")
f.close()
f2 = open(sys.argv[2], "w")
f2.write(("%s\t%s" % (s2[0], int(sys.argv[4]))).encode("utf-8"))
f2.close()
f3 = open(sys.argv[3], "w")
f3.write(("%s|||%s|||%s" % (s2[1], s2[2], s2[3])).encode("utf-8"))
f3.close()