Example #1
    def _convertBTrees(self, threshold=200):
        # Convert the base Lexicon's data structures first.
        Lexicon._convertBTrees(self, threshold)
        # Nothing to do if the digram index is already an OOBTree.
        if type(self._digrams) is OOBTree:
            return

        from BTrees.convert import convert

        # Rebuild the digram mapping as an OOBTree with IITreeSet values,
        # stored in the same ZODB connection as the lexicon itself.
        _digrams = self._digrams
        self._digrams = OOBTree()
        self._digrams._p_jar = self._p_jar
        convert(_digrams, self._digrams, threshold, IITreeSet)
Example #3
def main():
    # Assumes sys, os, datetime and the project's Lexicon / ParseTable classes are imported.
    # Use canned values when invoked as "... test"; otherwise prompt the user.
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        name = "test"
        treebank = "test"
        comments = "This parser was generated to test code"
    else:
        name = raw_input("What is the name of the parser?  :")
        treebank = raw_input("Which treebank do you want to use?  :")
        comments = raw_input("Any comments?  :")

    path = "experiments/" + name + "/"
    os.system("mkdir " + path[:-1])
    now = datetime.datetime.now()

    # Record the parser's metadata in a log file inside the experiment folder.
    log = open(path + "log", "w")
    log.write("++++Parser Information++++\n")
    log.write("name: " + name + "\n")
    log.write("based on treebank: " + treebank + "\n")
    log.write("time: " + str(now) + "\n")
    log.write("comments:\n" + comments + "\n")

    print "working..."
    os.system("notify-send \"building parsetable\"")

    os.system("./TOPify.py treebank/" + treebank + " " + path + "raw")

    # Build the lexicon and the parse table from the prepared treebank.
    lex = Lexicon()
    lex.extractFromTreebank(path + "raw", path)
    lex.save(path + "lexicon")
    pt = ParseTable()
    pt.generateFromTreeBank(path + "treebank")
    #pt.texfile(False,path)
    pt.csv(path)
    pt.save(path + "parsetable")

    log.write("++Parsetable Stats++\n")
    log.write(pt.stats())
    log.close()

    print "done!!!"

    os.system("notify-send \"Parsetable done.\"")
Example #4
def main():
    parserName = sys.argv[1]
    corpusName = sys.argv[2]
    pt = ParseTable()
    lex = Lexicon()
    print "loading parsetable....."
    pt.load("experiments/" + parserName + "/parsetable.pt")
    lex.load("experiments/" + parserName + "/lexicon.lex")
    print "done"
    p = Parser(pt, lex)
    corpus = open(corpusName, "r").readlines()
    # Drop a trailing blank line, if any.
    if corpus[-1] == "\n":
        corpus = corpus[:-1]
    # Parse the corpus line by line.
    for line in corpus:
        print line
        p.parse(line)
Example #5
def parse_json(json_input):
    """

    :param string:
    :return:
    """
    # load const tree
    const_tree = ConstTree.from_string(json_input['const_tree'])
    # load grammar
    grammar_list = [Grammar.from_string(i) for i in json_input['grammar']]
    # load lexicon list
    for i in json_input['lexicon']:
        Lexicon.from_string(i)
    word_set = Lexicon.lexicon_dict
    root = generate_f_strcuture(const_tree, grammar_list, word_set)
    output_string = f_structure_to_xml(json_input['const_tree'], root)

    return output_string
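A minimal, hypothetical call; the bracketed tree, the rule and the lexicon entries below are placeholders, not strings guaranteed to be accepted by ConstTree.from_string, Grammar.from_string or Lexicon.from_string:

    request = {
        'const_tree': '(S (NP John) (VP sleeps))',  # placeholder constituent tree
        'grammar': ['S -> NP VP'],                  # placeholder grammar rule
        'lexicon': ['John N', 'sleeps V'],          # placeholder lexicon entries
    }
    xml_output = parse_json(request)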
Example #6
class TestLexicon(TestCase):
    def setUp(self):
        self.lexicon = Lexicon()

    def test_add_to_string(self):
        try:
            self.lexicon.add_to_string("hello")
        except:
            self.fail("Exception thrown when adding to lexicon strings.")

    def test_to_string(self):
        self.lexicon.add_to_string("test thing")
        self.lexicon.add_to_string("more test thing")
        print self.lexicon.to_string()
        self.assertEquals(self.lexicon.to_string(), "test thing more test thing")
Example #7
def createCategoricalPreprocessorAndLexicons(lexiconPath, bugReportDatabase):
    # Define Filters and set preprocessing steps
    basicFilter = [
        preprocessing.TransformLowerCaseFilter(),
    ]

    lexiconJsons = json.load(open(lexiconPath))
    productLexicon = Lexicon.fromList(lexiconJsons['product'], True, 'product')
    severityLexicon = Lexicon.fromList(lexiconJsons['bug_severity'], True,
                                       'bug_severity')
    componentLexicon = Lexicon.fromList(lexiconJsons['component'], True,
                                        'component')
    priorityLexicon = Lexicon.fromList(lexiconJsons['priority'], True,
                                       'priority')

    categoricalArgs = [
        ('product', productLexicon, basicFilter),
        ('bug_severity', severityLexicon, basicFilter),
        ('component', componentLexicon, basicFilter),
        ('priority', priorityLexicon, basicFilter),
        # BasicFieldPreprocessor('version', versionLexicon, basicFilter + [TransformNumberToZeroFilter()]),
    ]

    str = "Field name and Lexicon size: "

    for f, l, _ in categoricalArgs:
        str += "{} {}; ".format(f, l.getLen())

    logging.getLogger().info(str)

    lexicons = [
        productLexicon, severityLexicon, componentLexicon, priorityLexicon
    ]

    return preprocessing.CategoricalPreprocessor(categoricalArgs,
                                                 bugReportDatabase), lexicons
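A minimal, hypothetical call, assuming the lexicon JSON file maps the keys 'product', 'bug_severity', 'component' and 'priority' to lists of category values, and that bugReportDatabase comes from the surrounding project; the path below is illustrative:

    preprocessor, lexicons = createCategoricalPreprocessorAndLexicons(
        "categorical_lexicons.json",  # hypothetical path
        bugReportDatabase)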
Example #8
    def getLexicon(self, vocab_id=None):
        """Get the Lexicon in use.
        """

        if self._lexicon is None:
            ## if no lexicon is provided, create a default one
            try:

                if self.catalog is None:
                    self.catalog = self.aq_inner.aq_parent.aq_base

                self._lexicon = getattr(self.catalog,
                                        self.vocabulary_id).getLexicon()
            except:
                self._lexicon = Lexicon()
                self.vocabulary_id = '__intern__'

        return self._lexicon
Example #9
def load_embedding(opts, paddingSym):
    if opts["lexicon"]:
        emb = np.load(opts["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(opts["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)
    elif opts["word_embedding"]:
        # TODO: allow using embeddings together with other representations
        lexicon, embedding = Embedding.fromFile(opts['word_embedding'],
                                                'UUUKNNN',
                                                hasHeader=False,
                                                paddingSym=paddingSym)

    return lexicon, embedding
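A minimal, hypothetical call, assuming opts carries the two keys the function reads and that the padding symbol exists in the vocabulary; the file names are placeholders:

    opts = {
        "lexicon": "vocab.txt",           # hypothetical word list, one token per line
        "word_embedding": "vectors.npy",  # hypothetical pre-computed embedding matrix
    }
    lexicon, embedding = load_embedding(opts, paddingSym="</s>")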
Example #10
def adicicao(dadosRE, palavra, indice):
    # Strip digits and '*' from the term before indexing it.
    palavra = re.sub('[0-9*]', '', palavra)
    if palavra in dadosRE:
        contem = False
        # Term already in the lexicon: look for a posting of this document.
        for c in range(len(dadosRE[palavra].getListaPost())):
            if dadosRE[palavra].getListaPost()[c].getDoc() == indice:
                dadosRE[palavra].getListaPost()[c].setValor(dadosRE[palavra].getListaPost()[c].getValor() + 1)
                dadosRE[palavra].setFrequencia(dadosRE[palavra].getFrequencia() + 1)
                contem = True
        if not contem:
            # New document for a known term: add a posting and update the idf.
            posting = Postings(indice, 1)
            dadosRE[palavra].getListaPost().append(posting)
            dadosRE[palavra].setFrequencia(dadosRE[palavra].getFrequencia() + 1)
            dadosRE[palavra].setnumeroDoc(dadosRE[palavra].getnumeroDoc() + 1)
            dadosRE[palavra].setidf(math.log10((totalDoc + 1) / (dadosRE[palavra].getnumeroDoc())))
    else:
        # First occurrence of the term: create its lexicon entry and posting list.
        posting = Postings(indice, 1)
        lexicon = Lexicon(1, 1, 0, [])
        lexicon.getListaPost().append(posting)
        lexicon.setidf(math.log10((totalDoc + 1) / (lexicon.getnumeroDoc())))
        dadosRE[palavra] = lexicon
Example #11
    def __init__(self, lexicon_path, database, cache=None):
        super(DBR_CNN_CategoricalPreprocessor, self).__init__(database, cache)

        lexicons = json.load(open(lexicon_path))
        self.component_lexicon = Lexicon.fromList(lexicons['component'][1:], False, 'component')
        self.priorityLexicon = Lexicon.fromList(sorted(lexicons['priority'][1:], reverse=True), False, 'priority')
Example #12
from Sentence import Sentence
from Lexicon import Lexicon
from Feature import Feature
from getSample import *

class Main:
	def __init__(self,test_file,lexicon):
		self.sentences = []
		lines = open(test_file,"r").readlines()
		for line in lines:
			line = line.strip()
			self.sentences.append(Sentence(line,lexicon))

	def output(self,file_name):
		for sen in self.sentences:
			sen.output(file_name)
			getSample(sen)

if __name__ == '__main__': 
	lexicon = Lexicon()
	m = Main("./test/annotation_test.txt",lexicon)
	m.output('./Evaluation/tmp.txt')
Example #13
__author__ = 'ryancraig'

from Lexicon import Lexicon
import Reddit_Poster_Bot
import markovify
import time

lexicon = Lexicon()

#generate Markov chain
text_model = markovify.Text(lexicon.get_words_as_string())
num_posts = 1
for i in range(num_posts):
    print "Submitting {0} of {1}".format(str(i + 1), str(num_posts))
    Reddit_Poster_Bot.submit_to_reddit(text_model.make_short_sentence(50), text_model.make_sentence())
    time.sleep(3)
Example #14
    def fromFile(file,
                 unknownSymbol,
                 lexiconName=None,
                 hasHeader=True,
                 paddingSym=None):
        """
        Creates  a lexicon and a embedding from word2vec file.
        :param file: path of file
        :param unknownSymbol: the string that represents the unknown words.
        :return: (data.Lexicon.Lexicon, Embedding)
        """
        log = logging.getLogger(__name__)
        fVec = codecs.open(file, 'r', 'utf-8')

        # Read the number of words in the dictionary and the embedding size
        if hasHeader:
            nmWords, embeddingSizeStr = fVec.readline().strip().split(" ")
            embeddingSize = int(embeddingSizeStr)
        else:
            embeddingSize = None

        lexicon = Lexicon(unknownSymbol, lexiconName)
        # The empty array at index 0 stands for the unknown symbol.
        # At the end, it is replaced by a vector read from the w2v file or by a random vector.
        vectors = [[]]
        nmEmptyWords = 0

        for line in fVec:
            splitLine = line.rstrip().split(u' ')
            word = splitLine[0]

            if len(word) == 0:
                log.warning(
                    "An empty string was found in the embedding file. This vector will be thrown out."
                )
                nmEmptyWords += 1
                continue

            vec = [float(num) for num in splitLine[1:]]

            if word == unknownSymbol:
                if len(vectors[0]) != 0:
                    raise Exception("A unknown symbol was already inserted.")

                vectors[0] = vec
            else:
                lexicon.put(word)
                vectors.append(vec)

        expected_size = lexicon.getLen() - 1 + nmEmptyWords

        if len(vectors[0]) == 0:
            if embeddingSize is None:
                embeddingSize = len(vectors[-1])

            vectors[0] = generateVector(embeddingSize)
            expected_size += 1

        if hasHeader:
            if int(nmWords) != expected_size:
                raise Exception(
                    "The size of the lexicon differs from the number of vectors")

        if paddingSym is None:
            paddingIdx = None
        else:
            if not lexicon.exist(paddingSym):
                paddingIdx = lexicon.put(paddingSym)
                vectors.append([0.0] * embeddingSize)
            else:
                paddingIdx = lexicon.getLexiconIndex(paddingSym)

        fVec.close()

        return lexicon, Embedding(lexicon, vectors, paddingIdx=paddingIdx)
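A minimal, hypothetical call, assuming a word2vec-format text file whose first line holds the vocabulary size and the embedding dimension; the file name and padding symbol are placeholders:

    lexicon, embedding = Embedding.fromFile("vectors.w2v",  # hypothetical embedding file
                                            "UUUKNNN",
                                            hasHeader=True,
                                            paddingSym="</s>")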