Example #1
def gen_entity_mentions_map(wiki_path, name2wikiid, wikiid2id, ent_max):

    hyp_pattern = re.compile(r'<a[^>]*href=\"([^\">]+)\"[^>]*>([^>]+)</a>',
                             re.DOTALL | re.UNICODE)

    ignored_names = set()
    wikiid2mentions = {}

    with open(wiki_path, 'r', encoding='utf8') as inf:

        for line in inf:

            if len(wikiid2mentions) % 100 < 10:
                show_progress(percent=len(wikiid2mentions) / ent_max)

            clean_text = wikicorpus.filter_wiki(line)
            hyp_matches = re.finditer(hyp_pattern, line)

            for link in hyp_matches:

                name = unquote(link.groups()[0])  # wikipedia url id
                mention = link.groups()[1]

                if name in name2wikiid:
                    wikiid = name2wikiid[name]
                    if wikiid not in wikiid2mentions:
                        wikiid2mentions[wikiid] = []
                    wikiid2mentions[wikiid].append(mention)
                else:
                    ignored_names.add(name)

    return wikiid2mentions
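For reference, a minimal sketch of what the anchor-tag regex above extracts, run on an invented WikiExtractor-style line (the sample text and page title are hypothetical):

import re
from urllib.parse import unquote

# A simplified version of the hyp_pattern above; the href holds the
# percent-encoded page title, the tag body holds the surface mention.
hyp_pattern = re.compile(r'<a[^>]*href="([^">]+)"[^>]*>([^>]+)</a>')
line = 'Born in <a href="New%20York%20City">New York</a> in 1950.'

for link in hyp_pattern.finditer(line):
    name = unquote(link.group(1))   # "New York City" -> candidate key into name2wikiid
    mention = link.group(2)         # "New York"      -> surface form collected as a mention
    print(name, '->', mention)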
Example #2
def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # keep only tokens longer than two characters
    tokens = [i for i in tokens if len(i)>2]

    return (tokens, text)
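A hedged usage sketch for preprocess_wikidata: the nltk/stop_words/gensim imports mirror what the snippet relies on, NLTK's WordNet data is assumed to be installed, and filter_more (a module-level pattern in the original project) is replaced with an illustrative stand-in:

import re
from gensim.corpora.wikicorpus import filter_wiki
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words

# Stand-in for the project's filter_more pattern (assumed here to strip
# leftover File/Image links that survive filter_wiki).
filter_more = re.compile(r'\[\[(?:File|Image):[^\]]*\]\]')

raw = "'''Apples''' are [[fruit]]s that grow on [[tree]]s."
tokens, text = preprocess_wikidata(raw)
print(tokens)   # something like ['apple', 'fruit', 'grow', 'tree']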
Example #3
def fetch_wiki_texts(in_file,
                     namespaces_to_filter=WIKI_DEFAULT_NAMESPACES_TO_FILTER,
                     min_text_length=200):
    return ((title, clean_text, page_id) for title, text, page_id in
            extract_pages(bz2.BZ2File(in_file), namespaces_to_filter)
            for clean_text in (filter_wiki(text), )
            if len(clean_text.strip()) >= min_text_length)
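A small usage sketch (the dump path is a placeholder): fetch_wiki_texts returns a lazy generator, so it can be consumed incrementally, for example with itertools.islice:

from itertools import islice

# Print the first three sufficiently long articles from a local dump.
for title, clean_text, page_id in islice(
        fetch_wiki_texts('enwiki-latest-pages-articles.xml.bz2'), 3):
    print(page_id, title, len(clean_text))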
Example #4
def preprocess(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # keep only tokens longer than two characters
    tokens = [i for i in tokens if len(i) > 2]

    return tokens
Example #5
def process_article(args):
    title = args[0]
    text = filter_wiki(args[1])
    text = utils.to_unicode(text, encoding='utf8', errors="strict")
    text = text.replace('\n', ' ')

    pageid = args[2]
    return title, text, pageid
Example #6
    def get_text(self):
        result = ""
        elelist = self.dom.getElementsByTagName('rev')
        if elelist.length != 0:
            ele = elelist[0]
            s = filter_wiki(ele.childNodes[0].data).encode('sjis', 'ignore')
            result = re.sub(r'[^a-zA-Z ]', '', s.decode('sjis', 'ignore')).lower()
        return result
Example #7
def _get_plaintext(content):
    return WIKI_CRUFT_RE.sub(
        r'',
        WIKI_NEWLINE_RE.sub(
            r'\n',
            WIKI_HEADER_RE.sub(r'\1',
                               WIKI_QUOTE_RE.sub(
                                   r'', filter_wiki(content))))).strip()
Example #8
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str), str)
        Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}, articleID).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0', )
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    id_path = "./{%(ns)s}id" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    articleID = elem.find(id_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(
            top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [
        filter_wiki(section_content) for section_content in section_contents
    ]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks, articleID
    else:
        return title, sections, articleID
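An illustrative call to segment with a hand-built <page> fragment (the export namespace URI and the article content are made up but follow the MediaWiki dump layout the function expects; the imports used by the snippet above are assumed to be in scope):

page_xml = """<page xmlns="http://www.mediawiki.org/xml/export-0.10/">
  <title>Apple</title>
  <ns>0</ns>
  <id>42</id>
  <revision>
    <text>The apple is a fruit.
== History ==
Apples have been cultivated for millennia.
</text>
  </revision>
</page>"""

title, sections, article_id = segment(page_xml)
print(title)                       # Apple
for heading, content in sections:  # ("Introduction", ...), ("History", ...)
    print(heading, '->', content.strip())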
Example #9
def iter_wiki(dump_file):  # making a wiki token stream
    """Yield the tokens of each article in the Wikipedia dump."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield tokens
Example #10
    def iter_wiki(self):
        """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
        for title, text, pageid in _extract_pages(smart_open(self.dump_file)):
            text = filter_wiki(text)
            tokens = [token for token in simple_preprocess(text) if token not in STOPWORDS]
            if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
                continue  # ignore short articles and various meta-articles
            yield title, tokens
Example #11
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, title, pageid = args
    text = filter_wiki(text)
    tokens = extract_jp_entities(text) if text else []
    return tokens, title, pageid
Example #12
def wiki_docs(dir="data/simple_wiki"):
    """
    :param path:
    :return:
    """
    for filename in os.listdir(os.path.join(BASE_DIR, dir)):
        with open(os.path.join(BASE_DIR, dir, filename)) as f:
            doc = filter_wiki(f.read())
            yield doc
Example #13
def process_article(args):
    # override original method in wikicorpus.py
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    # result = utils.lemmatize(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Example #14
def iter_wiki(dump_file):
    ignore_namespaces = "Wikipedia Category File Portal Template MediaWiki User Help Book Draft".split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(
                title.startswith(ns + ':') for ns in ignore_namespaces):
            continue
        yield title, tokens
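A follow-on sketch (the dump path is a placeholder) showing the usual next step: streaming the (title, tokens) pairs from iter_wiki into a gensim Dictionary without loading the corpus into memory:

from gensim.corpora import Dictionary

doc_stream = (tokens for title, tokens in
              iter_wiki('enwiki-latest-pages-articles.xml.bz2'))
id2word = Dictionary(doc_stream)
print(id2word)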
Example #15
def make_corpus(args):
    # Create new, empty file for the corpus
    wikidump_file = args.dump_file
    lang_code = wikidump_file.split("/")[-1][:2]
    output_filename = args.output_file
    f = open(output_filename, "w+")
    f.close()
    nlp = stanfordnlp.Pipeline(processors="tokenize",
                               lang=lang_code,
                               models_dir="/u/nlp/data/stanfordnlp_resources/")
    total_tokens = 0
    checkpoint = 100000
    for event, elem in etree.iterparse(args.dump_file,
                                       events=('start', 'end', 'start-ns',
                                               'end-ns')):
        if event == 'end':
            if elem.tag == add_ns("page"):
                ns = elem.find(add_ns("ns"))
                if ns is not None and ns.text == "0":
                    revision = elem.find(add_ns("revision"))
                    if revision is None:
                        continue
                    text_elem = revision.find(add_ns("text"))
                    if text_elem is None:
                        continue
                    text = text_elem.text
                    if text is None:
                        continue
                    text = wikicorpus.filter_wiki(text)
                    text = text.lower()
                    try:
                        sentences = nlp(text).sentences
                    except Exception:
                        continue
                    article_len = sum([len(sent.words) for sent in sentences])
                    if article_len > int(args.min_tokens_for_article):
                        for sentence in sentences:
                            words = [word.text for word in sentence.words]
                            # Take out heading words, that usually appear as ==heading==
                            words = [
                                word for word in words if "==" not in word
                            ]
                            if len(words) > 5:
                                total_tokens += len(words)
                                line = " ".join(words)
                                with open(output_filename, "a+") as outfile:
                                    outfile.write(line + "\n")
        if total_tokens >= checkpoint:
            print(f"At {total_tokens} tokens")
            checkpoint += 100000
        if total_tokens >= int(args.max_tokens):
            print("Reached max tokens! We're at {0}.".format(total_tokens))
            return
    print(f"Finished corpus with {total_tokens} tokens!")
Example #16
def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    return s
Example #17
def extract_text_content(xml_dump):
    # article_count = 0
    with open('wiki.en.txt', 'w') as file:
        for title, content, pageid in tqdm(extract_pages(xml_dump)):
            try:
                file.write(filter_wiki(content).strip() + "\n")
                # article_count += 1
                # if article_count % 10000 == 0:
                #     logging.info(f'{article_count} articles processed')
            except Exception as e:
                logging.warning(str(e))
Example #18
    def get_texts(self):
        length = 0

        for _, _, text in process_data(self.input):
            length += 1

            yield [tok for tok in self.tokenizer(filter_wiki(text)) if tok not in self.stopwords]

            if self.limit and length >= self.limit:
                break

            self.length = length
Example #19
def iter_wiki(dump_file, n=-1):
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    counter = 0
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        counter += 1
        if counter == n:
            break
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns+':') for ns in ignore_namespaces):
            continue
        yield title, tokens
Example #20
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str))
        Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks
    else:
        return title, sections
Example #21
def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    #cc = opencc.OpenCC('mix2s')
    #return cc.convert(s).strip()
    return s
Example #22
def main(args):
    os.makedirs(args.outdir, exist_ok=True)

    filter_namespaces = ('0', )

    out_i = 0
    for in_fname in glob.glob(args.inglob):
        for title, text, pageid in extract_pages(
                bz2.BZ2File(in_fname), filter_namespaces=filter_namespaces):
            if out_i % args.skip == 0:
                with open(os.path.join(args.outdir, f'{pageid}.txt'),
                          'w') as f:
                    f.write(filter_wiki(text))
            out_i += 1
Example #23
    def _clean(self, d):
        s = d[1]
        s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
        s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
        s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '', s)
        s = filter_wiki(s)
        s = re.sub(r'\* *\n|\'{2,}', '', s)
        s = re.sub(r'\n+', '\n', s)
        s = re.sub(r'\n[:;]|\n +', '\n', s)
        s = re.sub(r'(==+)', '\n', s)
        if self.s2t is True:
            return cc.convert(d[0]).strip(), cc.convert(s).strip()
        else:
            return d[0].strip(), s.strip()
Example #24
File: wiki.py Project: hans/deepBLE
def process_article(args):
    """Parse a Wikipedia article, returning its content as a list of
    sentences (each a list of utf8-encoded token strings).
    """

    text, do_lemmatize, title, pageid = args
    text = filter_wiki(text)

    process_fn = utils.lemmatize if do_lemmatize else tokenize
    sentences = []
    for sentence in re.split(SENTENCE_BOUNDARY, text):
        if not sentence:
            continue
        sentences.append(process_fn(sentence))

    return sentences, title, pageid
Example #25
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-t', '--tokenization', default='none')
    args = parser.parse_args()

    encoding = args.encoding
    tokenization = args.tokenization

    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)

    for text in sys.stdin:
        text = filter_wiki(text).strip()

        if tokenization != 'none':
            if tokenization == 'alpha':
                tokens = tokenize(text, tok_type='alpha', lower=False)
            elif tokenization == 'alpha-lower':
                tokens = tokenize(text, tok_type='alpha', lower=True)

            elif tokenization == 'all':
                tokens = tokenize(text, tok_type='all', lower=False, norm_num=True)
            elif tokenization == 'all-lower':
                tokens = tokenize(text, tok_type='all', lower=True, norm_num=True)
            elif tokenization == 'all-num':
                tokens = tokenize(text, tok_type='all', lower=False, norm_num=False)
            elif tokenization == 'all-lower-num':
                tokens = tokenize(text, tok_type='all', lower=True, norm_num=False)

            elif tokenization == 'nopunct':
                tokens = tokenize(text, tok_type='nopunct', lower=False, norm_num=True)
            elif tokenization == 'nopunct-lower':
                tokens = tokenize(text, tok_type='nopunct', lower=True, norm_num=True)
            elif tokenization == 'nopunct-num':
                tokens = tokenize(text, tok_type='nopunct', lower=False, norm_num=False)
            elif tokenization == 'nopunct-lower-num':
                tokens = tokenize(text, tok_type='nopunct', lower=True, norm_num=False)
            else:
                raise NotImplementedError

            text = ' '.join(tokens)

        if PAT_ALPHABETIC.match(text):
            sys.stdout.write(text + '\n')
Example #26
def prepare_data(filename, destname):
    pages = wc.extract_pages(bz2.BZ2File(filename), ('0',))
    corpus = []
    x = []
    y = []
    count = 0
    for p in pages:
        text = wc.filter_wiki(p[1])
        tokens = [token.encode('utf8') for token in utils.tokenize(text, errors='ignore')
                  if len(token) <= 15 and not token.startswith('_')]
        if len(tokens) >= 50:
            length = 0
            old_i = 0
            for i, token in enumerate(tokens):
                length += len(token)
                if length > MAX_CHAR_LENGTH:
                    corpus.append(tokens[old_i: i])
                    length = len(token)
                    old_i = i
                if i == len(tokens) - 1:
                    corpus.append(tokens[i:])
    count = 0
    for sent in corpus:
        count += 1
        if count >= 100000:
            break
        sent_y = []
        sent_x = []
        for token in sent:
            if all([65 <= c <= 90 or 97 <= c <= 122 for c in token]):
                sent_y.extend([False] * (len(token) - 1) + [True])
                sent_x.extend([c - 64 if c <= 90 else c - 70 for c in token])
        sent_y.extend([False] * (MAX_CHAR_LENGTH - len(sent_x)))
        sent_x.extend([0] * (MAX_CHAR_LENGTH - len(sent_x)))
        y.append(sent_y)
        x.append(sent_x)
        if len(sent_x) != MAX_CHAR_LENGTH:
            print(len(sent_x))

    x = np.array(x)
    y = np.array(y)
    pickle.dump(x, open(os.path.abspath(destname + '_x'), 'wb'))
    pickle.dump(y, open(os.path.abspath(destname + '_y'), 'wb'))
Example #27
def segment(page_xml):
    """Parse the content inside a page tag

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    Returns
    -------
    (str, list of (str, str))
        Structure contains (title, [(section_heading, section_content)]).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        assert len(section_contents) == len(section_headings)
    else:
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))
    return title, sections
Example #28
def extract_first_sentence(text):
    # extract section 0 (the article lead)
    summary_content = text.encode('UTF-8').decode('UTF-8')
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    summary_content = re.split(top_level_heading_regex, summary_content)[0]
    summary_content = filter_wiki(summary_content)
    summary_content = re.sub(r"'''", "", summary_content)
    summary_content = re.sub(r'\\n', ' ', summary_content)
    summary_content = re.sub(r'\\', '', summary_content)
    summary_content = re.sub(r"(\(.*?\))", "", summary_content)
    summary_content = summary_content.strip()
    sents = sent_tokenize(summary_content)
    if len(sents) < 1:
        return ""
    first = sents[0]
    if sents[0] == "." or sents[0] == "" or sents[0].startswith("See also"):
        if len(sents) > 1:
            first = sents[1]
        else:
            return ""
    return first.strip()
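A quick hypothetical call to extract_first_sentence; it assumes the NLTK punkt models behind sent_tokenize are installed, and the markup below is invented:

raw = ("'''Python''' is a high-level programming language.\n"
       "It was created by [[Guido van Rossum]].\n"
       "== History ==\n"
       "Its design began in the late 1980s.")
print(extract_first_sentence(raw))
# something like: Python is a high-level programming language.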
Example #29
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # initializations
    articles = {}
    all_missing = []
    redir_on = {}
    collisions = {}
    non_ascii = []
    site = mwclient.Site('en.wikipedia.org', '/w/api.php/')

    # get all txt files in a folder and iterate over them
    filelist = glob.glob(os.path.join(base_path,
                                      p['folder_path'],
                                      "*.txt"))
    for f in filelist:

        # get the word we are working on
        f_name = os.path.basename(f)
        k_word = os.path.splitext(f_name)[0]
        logger.info("working on file: %s" % f_name)

        # try to convert the word into ascii for the http query
        file_obj = codecs.open(f, "r", "utf-16")
        counter = 0
        words = []
        for w in file_obj.readlines():
            try:
                s = w.strip().decode('ascii')
                words.append(s)
            except Exception:
                counter += 1
                non_ascii.append(w.strip())
        logger.info("\t%d words containing non ascii are ommited" % counter)

        articles[k_word] = {}
        logger.info("\tfound %d words in file" % len(words))

        for word in words:
            data = {}
            page = site.Pages[word]

            # follow the redirect and check for collisions
            if page.redirect:
                res = re.search('\[\[(.+)\]\]', page.edit())
                redir_word = urllib.unquote(res.groups()[0])
                if redir_word in redir_on:
                    logger.warning("[%s AND %s] both redirect on --> %s" %
                                    (word, redir_on[redir_word], redir_word))
                    collisions[redir_word] = redir_on[redir_word]
                else:
                    logger.info("[%s] redir from [%s]" % (redir_word, word))
                    redir_on[redir_word] = word
                text = site.Pages[redir_word].edit()
                data['redirected'] = redir_word

            else:
                text = page.edit()

            # check for missing wikipedia articles
            if text == "":
                all_missing.append(word)
                continue

            # preprocess the received article
            data['text'] = wikicorpus.filter_wiki(text)
            in_ascii = ud.normalize('NFKD',
                                    data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][word] = data

    logger.info('add human rating to the articles')
    id_word = {}
    sparql_path = os.path.join(base_path, p['sparql_path'])
    with open(os.path.join(sparql_path, 'id_word.txt')) as f:
        for line in f.readlines():
            idx, word = line.strip().split('\t')
            id_word[idx] = word

    #add human rating to the wikipedia data
    not_found = []
    with open(os.path.join(sparql_path, p['human_file'])) as f:
        for line in f.readlines():
            arr = line.split()
            word = id_word[arr[0]]
            term = arr[3]
            try:
                articles[word][term]['rating'] = int(arr[4])
            except KeyError:
                not_found.append(term)
    logger.info("%d words from the ref queries not found" % len(not_found))

    f = open(os.path.join(output_dir, "articles.pickle"), 'wb')
    pickle.dump(articles, f)
    f.close()

    info = {}
    info['missing'] = all_missing
    info['redirs'] = redir_on
    info['collisions'] = collisions
    info['not_found'] = not_found
    info['non_ascii'] = non_ascii
    f = open(os.path.join(output_dir, "info.pickle"), 'wb')
    pickle.dump(info, f)
    f.close()

    logger.info("%d redirecting collisions (see info.pkl)" % len(collisions))
Example #30
def _get_plaintext(content):
    return CRUFT_RE.sub(r'', NEWLINE_RE.sub(r'\n', HEADER_RE.sub(r'\1', QUOTE_RE.sub(r'', filter_wiki(content))))).strip()
Example #31
def tokenize(text):
    return [token for token in simple_preprocess(filter_wiki(text)) if token not in UTF8STP]
Example #32
    def _clean_content(self, content):
        return WIKI_CRUFT_RE.sub(
            r'', WIKI_NEWLINE_RE.sub(
                r'\n', WIKI_HEADER_RE.sub(
                    r'\1', WIKI_QUOTE_RE.sub(
                        r'', filter_wiki(content))))).strip()