def scanWikiExtractorFile(fileName):
    print "reading file ", fileName
    with open(fileName, 'r') as f:
        content = f.read()
        articles = content.split('</doc>')
        for article in articles[:-1]:
            # print article
            try:
                soup = BeautifulStoneSoup(article)
            except UnicodeEncodeError:
                print "UnicodeEncodeError"
                continue

            title = soup.find('doc')['title']
            text = soup.getText()
            # The title may contain '/', ':' and other characters that are invalid in file names
            for c in invalid_filename_chars:
                if c in title:
                    title = title.replace(c, ' ')

            print title
            hindiArtName = title
            g = codecs.open(HINDI_SAVE_DIR + '/' + hindiArtName,
                            'w',
                            encoding='utf-8-sig')
            g.write(text)
            g.close()
            # break
    return
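A minimal driver sketch for the function above; the directory names, the invalid_filename_chars list, and the WikiExtractor output layout are assumptions here, and BeautifulStoneSoup comes from the old BeautifulSoup 3 package.

import os
import codecs
from BeautifulSoup import BeautifulStoneSoup  # BeautifulSoup 3 API

HINDI_SAVE_DIR = 'hindi_articles'   # assumed output directory
HINDI_DUMP_DIR = 'hi_extracted'     # assumed WikiExtractor output directory
invalid_filename_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']

if __name__ == '__main__':
    if not os.path.exists(HINDI_SAVE_DIR):
        os.makedirs(HINDI_SAVE_DIR)
    # WikiExtractor writes files like AA/wiki_00; walk the whole tree
    for root, dirs, files in os.walk(HINDI_DUMP_DIR):
        for name in sorted(files):
            scanWikiExtractorFile(os.path.join(root, name))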
Example #2
def scanWikiExtractorFile(fileName):
    global comparableFileNumber
    global comparableMap
    print "reading file ", fileName
    with open(fileName, 'r') as f:
        content = f.read()
        articles = content.split('</doc>')
        for article in articles[:-1]:
            # print article
            try:
                soup = BeautifulStoneSoup(article)
            except UnicodeEncodeError:
                print "UnicodeEncodeError"
                continue

            title = soup.find('doc')['title']
            if title in comparableMap:  # this english article has a corresponding hindi article

                print title, 'found'

                comparableFileNumber += 1
                text = soup.getText()

                # No need to sanitize the title here, since files are stored by number rather than by title.
                # (The title may contain '/', ':' and other characters that are invalid in file names.)
                # for c in invalid_filename_chars:
                #     if c in title:
                #         title = title.replace(c, ' ')
                # print title
                # engArtName = title

                g = codecs.open(CORRESPONDING_ENGLISH_SAVE_DIR + '/en_' + str(comparableFileNumber),
                                'w',
                                encoding='utf-8-sig')
                g.write(text)
                g.close()

                # now we need to move the comparable hindi article into the CORRESPONDING_HINDI_SAVE_DIR

                # Why not move every hindi article listed in comparableMap into
                # CORRESPONDING_HINDI_SAVE_DIR up front, instead of one by one here?
                # Because some of those hindi ids may have no corresponding english
                # article in this wiki dump. And since english and hindi documents
                # must be aligned by comparableFileNumber, assigning the numbers
                # beforehand could leave gaps wherever an english article is missing.

                hindiId = comparableMap[title]
                shutil.copyfile(HINDI_ARTICLES_DIR + '/' + hindiId,
                                CORRESPONDING_HINDI_SAVE_DIR + '/hi_' + str(comparableFileNumber))

                # Titles are unique, so this mapping will never be needed again;
                # deleting it shrinks the map and speeds up worst-case lookups.
                del comparableMap[title]

    return
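The comparableMap used above is assumed to map English article titles to Hindi article file names under HINDI_ARTICLES_DIR. A minimal loader sketch, assuming a hypothetical tab-separated mapping file:

import codecs

comparableMap = {}
comparableFileNumber = 0

def loadComparableMap(mappingFile):
    # assumed format, one pair per line: <english title>\t<hindi article id>
    with codecs.open(mappingFile, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) == 2:
                comparableMap[parts[0]] = parts[1]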
Example #3
def sentenceToWordlistHindi(sentence, remove_stopwords=False):
    soup = BeautifulStoneSoup(sentence)
    if soup is not None:
        sentence_text = soup.getText()
    else:
        print "soup has not been cooked yet !"
        return []

    words = sentence_text.split()

    # if IF_STEMMING:                           # don't do stemming here, the shallow parser is worthless!
    #     words = stemHindi(sentence_text)

    if remove_stopwords:
        stops = set(stopwords.words("hindi"))
        words = [w for w in words if w not in stops]

    return words
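A usage sketch; the sample sentence is hypothetical, and since NLTK's stock stopwords corpus may not ship a "hindi" list, remove_stopwords=True assumes one has been installed locally.

# -*- coding: utf-8 -*-
sentence = u'<doc>भारत एक देश है</doc>'
print sentenceToWordlistHindi(sentence)
# assumes a locally installed "hindi" stopword list for NLTK
print sentenceToWordlistHindi(sentence, remove_stopwords=True)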
Example #5
def sentenceToWordlistEnglish(sentence, remove_stopwords=False):
    soup = BeautifulStoneSoup(sentence)
    if soup is not None:
        sentence_text = soup.getText()
    else:
        print "soup has not been cooked yet !"
        return []

    # 2. Remove non-letters
    # sentence_text = re.sub("[^a-zA-Z]"," ", sentence_text)

    words = sentence_text.lower().split()

    if IF_STEMMING:
        words = [ENstemmer.stem(w) for w in words]

    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words
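A usage sketch; the IF_STEMMING flag and the ENstemmer global are assumptions here, with NLTK's Porter stemmer as one plausible choice.

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

IF_STEMMING = True           # assumed flag
ENstemmer = PorterStemmer()  # assumed stemmer

print sentenceToWordlistEnglish(u'<doc>The cats are running quickly</doc>',
                                remove_stopwords=True)
# prints something along the lines of [u'cat', u'run', u'quickli']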
Example #7
File: ygd.py Project: tuxdna/ygd
def download_message(message_id, message_path, yahoo_group):
    mkdir_p(message_path)
    header_filepath = message_path+'/header'
    body_filepath = message_path+'/body'
    na_filepath = message_path+'/na'
    allhtml_filepath = message_path+'/all_html'

    # message_path always exists after mkdir_p above, so only the "na" marker matters
    if os.path.exists(na_filepath):
        return

    if os.path.exists(header_filepath) and os.path.exists(body_filepath):
        return

    msg_url = '%s/%s/message/%s?source=1&unwrap=1' % (YG_BASE_URL, yahoo_group, message_id)
    # sleep_duration = HUMAN_WAIT + random.randint(0, HUMAN_REFLEX)
    # if VERBOSE and sleep_duration:
    #     print ".... sleep %s .... " % sleep_duration
    # time.sleep(sleep_duration)
    tc.go(msg_url)
    b = tc.get_browser()
    html = b.get_html()
    pattern_invalid = re.compile("Message (%s)? does not exist in %s"%(message_id, yahoo_group))
    m0 = re.search(pattern_invalid, html)

    f = open(allhtml_filepath, 'w')
    f.write(html)
    f.close()

    if m0:
        print "Message %s doesn't exist"%message_id
        f = open(na_filepath, 'w')
        f.close()
        return

    pattern_content = re.compile(r'<!-- start content include -->\s(.+?)\s<!-- end content include -->', re.DOTALL)
    m1 = re.search(pattern_content, html)

    if not m1:
        print "invalid format: html"
        return

    email_content = m1.group(1)
    mysoup = BeautifulSoup(email_content)
    source_content = repr(mysoup.find('td', {'class': 'source user'}))
    source_content = unicode(source_content, 'utf-8', errors='replace')
    source_content = source_content.encode('utf-8')

    m2 = re.search(re.compile(r'\s+(From .+?\s*)?<br />\s+<br />\s+(.+)</td>',re.DOTALL), source_content)

    if not m2:
        print "invalid format: email_content"
        f = open("source_content", 'w')
        f.write(source_content)
        f.close()
        sys.exit(1)

    email_header = m2.group(1)
    new_header_lines = []
    for l in email_header.split('\n'):
        # unwrap <a> tags to their text, strip the trailing <br />,
        # then decode HTML entities via BeautifulStoneSoup
        nl = re.sub(r'<a href=".+?>(.+?)<\/a>', lambda m: m.group(1), l)
        nl = re.sub(r'<br />$', '', nl)
        nl = BeautifulStoneSoup(nl, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        nl = nl.getText()
        new_header_lines.append(nl)

    email_header = '\n'.join(new_header_lines)

    email_body = m2.group(2)
    new_body_lines = []
    for l in email_body.split('\n'):
        # same per-line cleanup as for the header
        nl = re.sub(r'<a href=".+?>(.+?)<\/a>', lambda m: m.group(1), l)
        nl = re.sub(r'<br />$', '', nl)
        nl = BeautifulStoneSoup(
            nl,
            convertEntities=BeautifulStoneSoup.HTML_ENTITIES
            )
        nl = nl.getText()
        new_body_lines.append(nl)

    email_body = '\n'.join(new_body_lines)

    f_header = open(header_filepath, 'w')
    f_header.write(email_header.encode('utf-8'))
    f_header.close()

    f_body = open(body_filepath, 'w')
    email_body = email_body.encode('utf-8')
    f_body.write(email_body)
    f_body.close()
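A hypothetical driver for download_message; the group name, message-id range, and archive root are assumptions, tc is twill.commands (which provides go() and get_browser()), and mkdir_p is the usual `mkdir -p` wrapper.

import os
import errno

def mkdir_p(path):
    # create the directory tree, ignoring "already exists"
    try:
        os.makedirs(path)
    except OSError, e:
        if e.errno != errno.EEXIST:
            raise

yahoo_group = 'some-group'        # assumed group name
for message_id in range(1, 101):  # assumed message-id range
    message_path = 'archive/%s/%d' % (yahoo_group, message_id)
    download_message(str(message_id), message_path, yahoo_group)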