Example #1
def describe(corpus):
    # Per-file statistics: average word length (chars/word), sentence length
    # (words/sentence), lexical diversity (words/vocabulary item), and file id.
    print("\t".join(["c/w", "w/s", "w/v", "id"]))
    for fileid in corpus.fileids():
        nchars = len(corpus.raw(fileid))
        nwords = len(corpus.words(fileid))
        nsents = len(corpus.sents(fileid))
        nvocab = len(set(w.lower() for w in corpus.words(fileid)))
        print("\t".join([str(nchars // nwords), str(nwords // nsents),
                         str(nwords // nvocab), fileid]))
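
A minimal usage sketch, assuming one of NLTK's bundled plaintext corpora (the choice of inaugural is illustrative; any reader exposing raw, words, and sents works):

import nltk
from nltk.corpus import inaugural

nltk.download('inaugural', quiet=True)  # fetch the corpus data if missing
nltk.download('punkt', quiet=True)      # sentence tokenizer used by .sents()
                                        # (newer NLTK versions use 'punkt_tab')
describe(inaugural)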
Example #2
    def SearchCorpus(self, event):
        # Simplified Lesk-style disambiguation: for each Hamshahri document
        # containing the query word, score every sense by the overlap between
        # its gloss and a window of context words, and report unambiguous wins.
        # External names assumed from the surrounding project: correctPersianString
        # (Persian text normalizer), stopwords (Persian stop-word list), and
        # frame (the application's main window).
        userword = self.text.GetValue()
        wordsenses = self.ShowSenses(self)
        corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                                  r'(?!\.).*\.xml')
        for fileid in corpus.fileids():
            for doc in corpus.xml(fileid).getchildren():
                text = doc.getchildren()[5].text
                newtext = correctPersianString(text)
                allwords = newtext.split()
                if userword in allwords:
                    overlap = {}
                    bestsenses = []
                    wordindx = allwords.index(userword)
                    # Up to 8 words of context on each side of the target.
                    context = allwords[max(0, wordindx - 8):wordindx + 8]
                    purecontextwords = set(context) - set(stopwords)
                    for gloss, senseid in wordsenses[userword]:
                        pureglosswords = set(gloss.split()) - set(stopwords)
                        common = pureglosswords & purecontextwords
                        common.discard(userword)
                        overlap[senseid] = len(common)
                    bestoverlap = max(overlap.values())
                    if bestoverlap > 0:
                        bestsenses = [senseid for senseid, n in overlap.items()
                                      if n == bestoverlap]
                    if len(bestsenses) == 1:
                        print(' '.join(context), '\t', bestsenses)
                        frame.printarea.Clear()
                        frame.printarea.write(' '.join(context) + '\t' + str(bestsenses))
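
The handler above is essentially the simplified Lesk algorithm: each sense is scored by how many words its gloss shares with a window around the target. A self-contained sketch of the same idea against NLTK's English WordNet, assuming the wordnet and stopwords data packages are downloaded (the Persian sense inventory used above is external to this snippet):

from nltk.corpus import stopwords, wordnet

def simple_lesk(word, context_words):
    # Score each WordNet sense of `word` by gloss/context word overlap
    # and return the best-scoring sense (or None when nothing overlaps).
    stop = set(stopwords.words('english'))
    context = set(w.lower() for w in context_words) - stop
    best, best_score = None, 0
    for sense in wordnet.synsets(word):
        gloss = set(sense.definition().lower().split()) - stop
        score = len((gloss & context) - {word})
        if score > best_score:
            best, best_score = sense, score
    return best

print(simple_lesk('bank', 'the sloping bank beside the water'.split()))
# -> typically Synset('bank.n.01'), the riverbank sense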
Example #3
def create_dfs(corpus):
    print("Gathering data..")
    hold_files = corpus.fileids()
    rowlist = []
    for each in hold_files:
        each_row = {}
        each_row['Year'], each_row['Last_name'], _ = each.replace(
            '-', '.').split('.')
        each_row['Text'] = pre_process(
            corpus.raw(each))  # Preprocessed text file
        rowlist.append(each_row)
    print("Creating dataframe..")
    df = pd.DataFrame(rowlist)
    df['Year'] = df['Year'].astype(int)
    tf_idf_df = get_tfidf(df)

    return tf_idf_df, df
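
pre_process and get_tfidf are project helpers defined elsewhere, so this only runs once they (and pandas as pd) are in scope; a sketch of the intended call using NLTK's inaugural corpus, whose fileids ('1789-Washington.txt', ...) match the Year-Name pattern the parser expects:

from nltk.corpus import inaugural

tf_idf_df, df = create_dfs(inaugural)
print(df[['Year', 'Last_name']].head())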
Example #4
import nltk
from gensim import corpora
from gensim.models import Phrases, TfidfModel
from nltk.corpus.reader import PlaintextCorpusReader

def build_tfidf(corpus_dir, model_filename):
    stemmer = nltk.stem.PorterStemmer()
    # A memory-friendly iterator over every .txt file under corpus_dir.
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt$')
    dictionary = corpora.Dictionary()

    # Learn frequent bigrams so common word pairs become single tokens.
    # TextCorpus and BowCorpus are project-specific wrappers defined elsewhere.
    bigram_transformer = Phrases(TextCorpus(corpus))

    for myfile in corpus.fileids():
        try:
            chunks = bigram_transformer[[word.lower() for word in corpus.words(myfile)]]
            dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])
        except Exception as e:
            print('Warning: error in file', myfile, '-', e)

    model = TfidfModel(BowCorpus(corpus, dictionary, bigram_transformer),
                       id2word=dictionary)
    model.save(model_filename)
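
Assuming a directory of plain-text files, the model is then built and saved with a single call:

build_tfidf('data/articles', 'articles.tfidf')  # illustrative paths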
Example #6
import codecs
from nltk.corpus.reader import XMLCorpusReader
from nltk.corpus.util import LazyCorpusLoader

def hamshahri_targetword_corpus_maker(match, outpath):
    # Collect every Hamshahri document that contains the word `match` into a
    # single UTF-8 file, writing 'ALI' as a record separator between documents.
    # correctPersianString is a project-specific normalizer, defined elsewhere.
    print('loading hamshahri corpus')
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                              r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    punclist = [u'،', u'؛', u':', u'؟', u'#']  # Persian punctuation to strip

    matchnum = 0
    count = 0
    print('creating target corpus')
    for fileid in corpus.fileids():
        for doc in corpus.xml(fileid).getchildren():
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')

            for item in punclist:
                if item in newtext:
                    newtext = newtext.replace(item, '')

            if match in newtext.split():
                matchnum += 1
                count += 1
                print(newtext)
                print('#')
                outfile.write(newtext)
                outfile.write('ALI')

    outfile.close()
    print(count)
Example #7
def ham_corpus_maker(outpath, word):
    # Write every Hamshahri document that contains `word` to a plain-text
    # file, one document per line.
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                              r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    instancenum = 0
    targetwordnum = 0
    for fileid in corpus.fileids():
        for doc in corpus.xml(fileid).getchildren():
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')
            if word in newtext.split():
                instancenum += 1                              # matching documents
                targetwordnum += newtext.split().count(word)  # total occurrences
                print(newtext)
                outfile.write(newtext)
                outfile.write('\n')

    print(str(instancenum) + " seeds found")
    print(str(targetwordnum) + " target words found")
    outfile.close()
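
A hedged usage sketch, assuming the Hamshahri XML files are installed where LazyCorpusLoader can resolve them:

# Collect every document mentioning the ambiguous word 'شیر' ("lion"/"milk"),
# the same target word used in Example #11 below; the output path is illustrative.
ham_corpus_maker('shir_docs.txt', u'شیر')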
Example #8
# Build a word -> [(gloss, sense id), ...] table from the sense list `c`
# (row layout inferred from use: word, two gloss fields, sense id).
# `c`, `words`, `wordsense`, and `corpus` are defined earlier in the
# original script and are assumed to be in scope here.
for item in c:
    item[0] = correctPersianString(item[0])
    item[1] = correctPersianString(item[1])
    for word in words:
        if word == item[0]:
            mixed = item[1] + ' ' + item[2]
            wordsense.setdefault(item[0], []).append((mixed, item[3]))

anothercount = 0
total = 0
nooverlap = 0
num = 0
commonwords = {}
# Scan the corpus for sentences containing any of the target words.
for fileid in corpus.fileids():
    for doc in corpus.xml(fileid).getchildren():
        cat = doc.getchildren()[3].text
        text = doc.getchildren()[5].text
        newtext = correctPersianString(text)
        allwords = newtext.split()
        sents = newtext.split('.')
        for word in words:
            for sent in sents:
                if word in sent.split():
                    print(sent)  # the snippet is truncated here in the original

corpusname = "inaugural"
if len(sys.argv) >= 2:
    corpusname = sys.argv[1]

filelim = 4
if len(sys.argv) >= 3:
    filelim = int(sys.argv[2])

corpus = getattr(nltk.corpus, corpusname)


def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


path = "./%s" % corpusname
mkdir_p(path)

for i in range(0, filelim):
    fid = corpus.fileids()[i]
    with open("%s/%s" % (path, fid), 'w') as out:
        # need to remove new lines here so MR interprets each file
        # as a single input
        out.write(corpus.raw(fid).replace('\n', ' '))
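
A quick sanity check of the exported files, assuming the script above has been run with its defaults (corpus inaugural, filelim 4):

import os

print(sorted(os.listdir('./inaugural')))
# e.g. ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt']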
corpusname = "inaugural"
if len(sys.argv) >= 2:
    corpusname = sys.argv[1]

filelim = 4
if len(sys.argv) >= 3:
    filelim = int(sys.argv[2])

corpus = getattr(nltk.corpus, corpusname)


def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

path = "./%s" % corpusname
mkdir_p(path)


for i in range(0, filelim):
    fid = corpus.fileids()[i]
    with open("%s/%s" % (path, fid), 'w') as out:
        # need to remove new lines here so MR interprets each file
        # as a single input
        out.write(corpus.raw(fid).replace('\n', ' '))
Example #11
print("real final")

words = [u'شیر']  # the ambiguous target word ("lion"/"milk")

total = 0
nooverlap = 0
num = 0
number = 0
commonwords = {}
for fileid in corpus.fileids():
    for doc in corpus.xml(fileid).getchildren():
        cat = doc.getchildren()[3].text
        text = doc.getchildren()[5].text
        newtext = correctPersianString(text)
        allwords = newtext.split()
        sents = newtext.split('.')
        for word in words:
            for sent in sents:
                if word in sent.split():
                    total += 1
                    overlap = {}  # the snippet is truncated here in the original
Example #12
def contents(corpus):
    return corpus.fileids()
Example #14
def del_testcorpus(root):
    # Clean up the temporary test corpus created by make_testcorpus().
    for fileid in os.listdir(root):
        os.remove(os.path.join(root, fileid))
    os.rmdir(root)


# plaintext corpus reader
root = make_testcorpus(ext='.txt',
                       a="""
                       This is the first sentence.  Here is another
                       sentence!  And here's a third sentence.

                       This is the second paragraph.  Tokenization is currently
                       fairly simple, so the period in Mr. gets tokenized.
                       """,
                       b="""This is the second file.""")
corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt'])
print(corpus.fileids())
corpus = PlaintextCorpusReader(root, r'.*\.txt')
print(corpus.fileids())
print(str(corpus.root) == str(root))
print(corpus.words())
print(corpus.raw()[:40])
print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()])
print(corpus.words('a.txt'))
print(corpus.words('b.txt'))
print(corpus.words()[:4], corpus.words()[-4:])
# del_testcorpus(root)
for corpus in (abc, genesis, inaugural, state_union, webtext):
    print(str(corpus).replace('\\\\', '/'))
    print('  ', repr(corpus.fileids())[:60])
    print('  ', repr(corpus.words()[:10])[:60])
root = make_testcorpus(a="""