def _convertBTrees(self, threshold=200):
    Lexicon._convertBTrees(self, threshold)
    if type(self._digrams) is OOBTree:
        return

    from BTrees.convert import convert

    _digrams = self._digrams
    self._digrams = OOBTree()
    self._digrams._p_jar = self._p_jar
    convert(_digrams, self._digrams, threshold, IITreeSet)
def main():
    # Use the built-in test configuration when "test" is passed on the command
    # line; otherwise ask the user interactively.
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        name = "test"
        treebank = "test"
        comments = "This parser was generated to test code"
    else:
        name = raw_input("What is the name of the parser? :")
        treebank = raw_input("Which treebank do you want to use? :")
        comments = raw_input("Any comments? :")

    path = "experiments/" + name + "/"
    os.system("mkdir " + path[:-1])

    now = datetime.datetime.now()
    log = open(path + "log", "w")
    log.write("++++Parser Information++++\n")
    log.write("name: " + name + "\n")
    log.write("based on treebank: " + treebank + "\n")
    log.write("time: " + str(now) + "\n")
    log.write("comments:\n" + comments + "\n")

    print "working..."
    os.system("notify-send \"building parsetable\"")
    os.system("./TOPify.py treebank/" + treebank + " " + path + "raw")

    lex = Lexicon()
    lex.extractFromTreebank(path + "raw", path)
    lex.save(path + "lexicon")

    pt = ParseTable()
    pt.generateFromTreeBank(path + "treebank")
    #pt.texfile(False,path)
    pt.csv(path)
    pt.save(path + "parsetable")

    log.write("++Parsetable Stats++\n")
    log.write(pt.stats())
    log.close()

    print "done!!!"
    os.system("notify-send \"Parsetable done.\"")
def main():
    parserName = sys.argv[1]
    corpusName = sys.argv[2]

    pt = ParseTable()
    lex = Lexicon()

    print "loading parsetable....."
    pt.load("experiments/" + parserName + "/parsetable.pt")
    lex.load("experiments/" + parserName + "/lexicon.lex")
    print "done"

    p = Parser(pt, lex)

    corpus = open(corpusName, "r").readlines()
    if corpus[-1] == "\n":
        corpus = corpus[:-1]

    for line in corpus:
        print line
        p.parse(line)
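# Invocation sketch (the script name is a placeholder; the parser name must
# match a directory under experiments/ created by the training script above):
#   python parse_corpus.py myparser sentences.txt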
def parse_json(json_input):
    """
    :param json_input: dict with 'const_tree', 'grammar' and 'lexicon' entries
    :return: the f-structure serialized as an XML string
    """
    # load const tree
    const_tree = ConstTree.from_string(json_input['const_tree'])

    # load grammar
    grammar_list = [Grammar.from_string(i) for i in json_input['grammar']]

    # load lexicon list
    for i in json_input['lexicon']:
        Lexicon.from_string(i)
    word_set = Lexicon.lexicon_dict

    root = generate_f_strcuture(const_tree, grammar_list, word_set)
    output_string = f_structure_to_xml(json_input['const_tree'], root)
    return output_string
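# Usage sketch for parse_json (not from the original code base): the keys below
# are the ones the function reads; the exact string formats expected by
# ConstTree.from_string, Grammar.from_string and Lexicon.from_string are
# project specific, so the values here are placeholders only.
json_input = {
    'const_tree': "...",   # constituent tree serialized as a string
    'grammar': ["..."],    # list of grammar rule strings
    'lexicon': ["..."],    # list of lexicon entry strings
}
xml_output = parse_json(json_input)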
class TestLexicon(TestCase):

    def setUp(self):
        self.lexicon = Lexicon()

    def test_add_to_string(self):
        try:
            self.lexicon.add_to_string("hello")
        except:
            self.fail("Exception thrown when adding to lexicon strings.")

    def test_to_string(self):
        self.lexicon.add_to_string("test thing")
        self.lexicon.add_to_string("more test thing")
        print self.lexicon.to_string()
        self.assertEquals(self.lexicon.to_string(), "test thing more test thing")
def createCategoricalPreprocessorAndLexicons(lexiconPath, bugReportDatabase):
    # Define Filters and set preprocessing steps
    basicFilter = [
        preprocessing.TransformLowerCaseFilter(),
    ]

    lexiconJsons = json.load(open(lexiconPath))

    productLexicon = Lexicon.fromList(lexiconJsons['product'], True, 'product')
    severityLexicon = Lexicon.fromList(lexiconJsons['bug_severity'], True, 'bug_severity')
    componentLexicon = Lexicon.fromList(lexiconJsons['component'], True, 'component')
    priorityLexicon = Lexicon.fromList(lexiconJsons['priority'], True, 'priority')

    categoricalArgs = [
        ('product', productLexicon, basicFilter),
        ('bug_severity', severityLexicon, basicFilter),
        ('component', componentLexicon, basicFilter),
        ('priority', priorityLexicon, basicFilter),
        # BasicFieldPreprocessor('version', versionLexicon, basicFilter + [TransformNumberToZeroFilter()]),
    ]

    msg = "Field name and Lexicon size: "
    for f, l, _ in categoricalArgs:
        msg += "{} {}; ".format(f, l.getLen())
    logging.getLogger().info(msg)

    lexicons = [productLexicon, severityLexicon, componentLexicon, priorityLexicon]

    return preprocessing.CategoricalPreprocessor(categoricalArgs, bugReportDatabase), lexicons
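# Hypothetical example of the lexicon JSON consumed above. The field names come
# from the code; the category values are invented placeholders, and the
# assumption that each list holds the possible values of one categorical field
# is inferred from how Lexicon.fromList is called.
lexiconJsons = {
    "product": ["Firefox", "Thunderbird"],
    "bug_severity": ["blocker", "critical", "normal"],
    "component": ["General", "Build"],
    "priority": ["P1", "P2", "P3"],
}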
def getLexicon(self, vocab_id=None):
    """Get the Lexicon in use."""

    if self._lexicon is None:
        ## if no lexicon is provided, create a default one
        try:
            if self.catalog is None:
                self.catalog = self.aq_inner.aq_parent.aq_base
            self._lexicon = getattr(self.catalog, self.vocabulary_id).getLexicon()
        except:
            self._lexicon = Lexicon()
            self.vocabulary_id = '__intern__'

    return self._lexicon
def load_embedding(opts, paddingSym):
    if opts["lexicon"]:
        emb = np.load(opts["word_embedding"])
        lexicon = Lexicon(unknownSymbol=None)

        with codecs.open(opts["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)
    elif opts["word_embedding"]:
        # todo: allow using embeddings and other representations
        lexicon, embedding = Embedding.fromFile(opts['word_embedding'], 'UUUKNNN',
                                                hasHeader=False, paddingSym=paddingSym)

    return lexicon, embedding
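# Minimal sketch of how load_embedding might be called (the key names are taken
# from the function above; the file paths and padding symbol are placeholders):
opts = {
    "lexicon": "data/vocab.txt",              # one word per line, or None/"" to skip this branch
    "word_embedding": "data/embeddings.npy",  # matrix loadable with np.load when "lexicon" is set
}
lexicon, embedding = load_embedding(opts, paddingSym="</s>")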
def adicicao(dadosRE, palavra, indice):
    # Strip digits and asterisks from the term before indexing it.
    palavra = re.sub('[0-9*]', '', palavra)

    if dadosRE.has_key(palavra):
        # Term already in the lexicon: update the existing posting for this
        # document, or append a new one.
        contem = False
        for c in range(len(dadosRE[palavra].getListaPost())):
            if dadosRE[palavra].getListaPost()[c].getDoc() == indice:
                dadosRE[palavra].getListaPost()[c].setValor(dadosRE[palavra].getListaPost()[c].getValor() + 1)
                dadosRE[palavra].setFrequencia(dadosRE[palavra].getFrequencia() + 1)
                contem = True

        if contem == False:
            posting = Postings(indice, 1)
            dadosRE[palavra].getListaPost().append(posting)
            dadosRE[palavra].setFrequencia(dadosRE[palavra].getFrequencia() + 1)
            dadosRE[palavra].setnumeroDoc(dadosRE[palavra].getnumeroDoc() + 1)
            dadosRE[palavra].setidf(math.log10((totalDoc + 1) / dadosRE[palavra].getnumeroDoc()))
    else:
        # New term: create a lexicon entry with a single posting.
        posting = Postings(indice, 1)
        lexicon = Lexicon(1, 1, 0, [])
        lexicon.getListaPost().append(posting)
        lexicon.setidf(math.log10((totalDoc + 1) / lexicon.getnumeroDoc()))
        dadosRE[palavra] = lexicon
def __init__(self, lexicon_path, database, cache=None):
    super(DBR_CNN_CategoricalPreprocessor, self).__init__(database, cache)

    lexicons = json.load(open(lexicon_path))

    self.component_lexicon = Lexicon.fromList(lexicons['component'][1:], False, 'component')
    self.priorityLexicon = Lexicon.fromList(sorted(lexicons['priority'][1:], reverse=True), False, 'priority')
from Sentence import Sentence
from Lexicon import Lexicon
from Feature import Feature
from getSample import *


class Main:

    def __init__(self, test_file, lexicon):
        self.sentences = []
        lines = open(test_file, "r").readlines()
        for line in lines:
            line = line.strip()
            self.sentences.append(Sentence(line, lexicon))

    def output(self, file_name):
        for sen in self.sentences:
            sen.output(file_name)
            getSample(sen)


if __name__ == '__main__':
    lexicon = Lexicon()
    m = Main("./test/annotation_test.txt", lexicon)
    m.output('./Evaluation/tmp.txt')
__author__ = 'ryancraig'

from Lexicon import Lexicon
import Reddit_Poster_Bot
import markovify
import time

lexicon = Lexicon()

# generate Markov chain
text_model = markovify.Text(lexicon.get_words_as_string())

num_posts = 1
for i in range(num_posts):
    print "Submitting {0} of {1}".format(str(i + 1), str(num_posts))
    Reddit_Poster_Bot.submit_to_reddit(text_model.make_short_sentence(50),
                                       text_model.make_sentence())
    time.sleep(3)
def fromFile(file, unknownSymbol, lexiconName=None, hasHeader=True, paddingSym=None):
    """
    Creates a lexicon and an embedding from a word2vec file.

    :param file: path of the file
    :param unknownSymbol: the string that represents unknown words
    :return: (data.Lexicon.Lexicon, Embedding)
    """
    log = logging.getLogger(__name__)
    fVec = codecs.open(file, 'r', 'utf-8')

    # Read the number of words in the dictionary and the embedding size
    if hasHeader:
        nmWords, embeddingSizeStr = fVec.readline().strip().split(" ")
        embeddingSize = int(embeddingSizeStr)
    else:
        embeddingSize = None

    lexicon = Lexicon(unknownSymbol, lexiconName)

    # The empty array represents the vector of the unknown symbol.
    # At the end, it is replaced either by the vector found in the w2v file or by a random vector.
    vectors = [[]]
    nmEmptyWords = 0

    for line in fVec:
        splitLine = line.rstrip().split(u' ')
        word = splitLine[0]

        if len(word) == 0:
            log.warning("An empty string was found in the embedding file. This vector will be thrown out.")
            nmEmptyWords += 1
            continue

        vec = [float(num) for num in splitLine[1:]]

        if word == unknownSymbol:
            if len(vectors[0]) != 0:
                raise Exception("An unknown symbol was already inserted.")
            vectors[0] = vec
        else:
            lexicon.put(word)
            vectors.append(vec)

    expected_size = lexicon.getLen() - 1 + nmEmptyWords

    if len(vectors[0]) == 0:
        if embeddingSize is None:
            embeddingSize = len(vectors[-1])
        vectors[0] = generateVector(embeddingSize)
        expected_size += 1

    if hasHeader:
        if int(nmWords) != expected_size:
            raise Exception("The size of the lexicon is different from the number of vectors.")

    if paddingSym is None:
        paddingIdx = None
    else:
        if not lexicon.exist(paddingSym):
            paddingIdx = lexicon.put(paddingSym)
            vectors.append([0.0] * embeddingSize)
        else:
            paddingIdx = lexicon.getLexiconIndex(paddingSym)

    fVec.close()

    return lexicon, Embedding(lexicon, vectors, paddingIdx=paddingIdx)
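# Usage sketch, assuming fromFile is exposed as Embedding.fromFile as in
# load_embedding above (the file path and padding symbol are placeholders;
# "UUUKNNN" is the unknown token used elsewhere in this code base):
lexicon, embedding = Embedding.fromFile("embeddings/word2vec.txt", "UUUKNNN",
                                        hasHeader=True, paddingSym="</s>")
paddingIdx = lexicon.getLexiconIndex("</s>")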
def setUp(self):
    self.lexicon = Lexicon()